86 files changed, 3250 insertions, 2929 deletions
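Most of the mechanical churn in this series falls into a few repeated patterns. The first removes the XFS-private kmem_zone_alloc()/kmem_zone_zalloc() wrappers (whose retry-forever loop is deleted from fs/xfs/kmem.c below) in favor of direct slab-cache calls with explicit GFP flags. A distilled before/after sketch of the call-site conversion, condensed from the hunks below; this is kernel code in context, not a standalone program:

	/* Before: XFS wrapper with a hidden retry-forever loop */
	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);

	/* After: direct slab calls; "cannot fail" is now spelled __GFP_NOFAIL */
	new = kmem_cache_alloc(xfs_bmap_free_item_zone,
			       GFP_KERNEL | __GFP_NOFAIL);
	cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);

Note how the old flag conventions translate: KM_NOFS becomes GFP_NOFS, a zero xfs_km_flags_t becomes GFP_KERNEL, and __GFP_NOFAIL preserves the wrapper's never-fail behaviour.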
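The second pattern drops transient per-AG reference cycles. AGF/AGI buffers carry a cached struct xfs_perag reference in b_pag that lives as long as the buffer does, so xfs_perag_get()/xfs_perag_put() pairs around short-lived uses can go away. A minimal sketch of the conversion, condensed from xfs_alloc_update_counters() below:

	/* Before: take and drop a reference around every access */
	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
	pag->pagf_freeblks += len;
	xfs_perag_put(pag);

	/* After: use the reference the AG buffer already pins */
	agbp->b_pag->pagf_freeblks += len;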
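Third, xfs_da_state_alloc() grows an args parameter so the constructor can fill in the fields that every caller used to set by hand, as the xfs_attr.c and xfs_dir2_node.c hunks below show:

	/* Before: open-coded at every call site */
	state = xfs_da_state_alloc();
	state->args = args;
	state->mp = args->dp->i_mount;

	/* After: fields set once, inside the allocator */
	state = xfs_da_state_alloc(args);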
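Finally, the attr code is restructured toward deferred operations: xfs_attr_rmtval_remove() is split so that __xfs_attr_rmtval_remove() unmaps one bounded chunk and returns -EAGAIN until the remote-value region is gone, with the caller rolling the transaction between steps. A self-contained userspace analogue of that retry shape follows; every name in it is an illustrative stand-in, not a kernel API:

	#include <errno.h>
	#include <stdio.h>

	static int blocks_left = 3;	/* stand-in for the remote-value extent */

	/* Models __xfs_attr_rmtval_remove(): one step, -EAGAIN until done. */
	static int one_unmap_step(void)
	{
		if (--blocks_left > 0)
			return -EAGAIN;
		return 0;
	}

	/* Models xfs_trans_roll_inode(): refresh context between steps. */
	static int refresh_context(void)
	{
		printf("rolling transaction\n");
		return 0;
	}

	int main(void)
	{
		int error, retval;

		do {
			retval = one_unmap_step();
			if (retval && retval != -EAGAIN)
				return 1;	/* hard error, give up */
			error = refresh_context();
			if (error)
				return 1;
		} while (retval == -EAGAIN);
		return 0;
	}

The point of this shape is that each pass through the loop runs in a fresh transaction, so unmapping an arbitrarily large remote value can never overflow a single transaction's reservation.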
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index f1366475c389..e841ed781a25 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -115,24 +115,3 @@ kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } - -void * -kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags) -{ - int retries = 0; - gfp_t lflags = kmem_flags_convert(flags); - void *ptr; - - trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_); - do { - ptr = kmem_cache_alloc(zone, lflags); - if (ptr || (flags & KM_MAYFAIL)) - return ptr; - if (!(++retries % 100)) - xfs_err(NULL, - "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)", - current->comm, current->pid, - __func__, lflags); - congestion_wait(BLK_RW_ASYNC, HZ/50); - } while (1); -} diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 34cbcfde9228..8e8555817e6d 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -85,14 +85,6 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) #define kmem_zone kmem_cache #define kmem_zone_t struct kmem_cache -extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t); - -static inline void * -kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags) -{ - return kmem_zone_alloc(zone, flags | KM_ZERO); -} - static inline struct page * kmem_to_page(void *addr) { diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 9d84007a5c65..8cf73fe4338e 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -563,7 +563,8 @@ xfs_ag_get_geometry( error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp); if (error) goto out_agi; - pag = xfs_perag_get(mp, agno); + + pag = agi_bp->b_pag; /* Fill out form. */ memset(ageo, 0, sizeof(*ageo)); @@ -583,7 +584,6 @@ xfs_ag_get_geometry( xfs_ag_geom_health(pag, ageo); /* Release resources. */ - xfs_perag_put(pag); xfs_buf_relse(agf_bp); out_agi: xfs_buf_relse(agi_bp); diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h index f3fd0ee9a7f7..8a8eb4bc48bb 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.h +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -37,16 +37,4 @@ xfs_ag_resv_rmapbt_alloc( xfs_perag_put(pag); } -static inline void -xfs_ag_resv_rmapbt_free( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, agno); - xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); - xfs_perag_put(pag); -} - #endif /* __XFS_AG_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 203e74fa64aa..852b536551b5 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -710,13 +710,12 @@ xfs_alloc_read_agfl( STATIC int xfs_alloc_update_counters( struct xfs_trans *tp, - struct xfs_perag *pag, struct xfs_buf *agbp, long len) { struct xfs_agf *agf = agbp->b_addr; - pag->pagf_freeblks += len; + agbp->b_pag->pagf_freeblks += len; be32_add_cpu(&agf->agf_freeblks, len); xfs_trans_agblocks_delta(tp, len); @@ -1175,8 +1174,7 @@ xfs_alloc_ag_vextent( } if (!args->wasfromfl) { - error = xfs_alloc_update_counters(args->tp, args->pag, - args->agbp, + error = xfs_alloc_update_counters(args->tp, args->agbp, -((long)(args->len))); if (error) return error; @@ -1887,7 +1885,6 @@ xfs_free_ag_extent( enum xfs_ag_resv_type type) { struct xfs_mount *mp; - struct xfs_perag *pag; struct xfs_btree_cur *bno_cur; struct xfs_btree_cur *cnt_cur; xfs_agblock_t gtbno; /* start of right neighbor */ @@ -2167,10 +2164,8 @@ xfs_free_ag_extent( /* * Update the freespace totals in the ag and superblock. 
*/ - pag = xfs_perag_get(mp, agno); - error = xfs_alloc_update_counters(tp, pag, agbp, len); - xfs_ag_resv_free_extent(pag, type, tp, len); - xfs_perag_put(pag); + error = xfs_alloc_update_counters(tp, agbp, len); + xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len); if (error) goto error0; @@ -2467,7 +2462,8 @@ xfs_defer_agfl_block( ASSERT(xfs_bmap_free_item_zone != NULL); ASSERT(oinfo != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); + new = kmem_cache_alloc(xfs_bmap_free_item_zone, + GFP_KERNEL | __GFP_NOFAIL); new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); new->xefi_blockcount = 1; new->xefi_oinfo = *oinfo; @@ -2689,7 +2685,7 @@ xfs_alloc_get_freelist( if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) agf->agf_flfirst = 0; - pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, -1); xfs_trans_agflist_delta(tp, -1); @@ -2701,7 +2697,6 @@ xfs_alloc_get_freelist( pag->pagf_btreeblks++; logflags |= XFS_AGF_BTREEBLKS; } - xfs_perag_put(pag); xfs_alloc_log_agf(tp, agbp, logflags); *bnop = bno; @@ -2797,7 +2792,7 @@ xfs_alloc_put_freelist( if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) agf->agf_fllast = 0; - pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, 1); xfs_trans_agflist_delta(tp, 1); @@ -2809,7 +2804,6 @@ xfs_alloc_put_freelist( pag->pagf_btreeblks--; logflags |= XFS_AGF_BTREEBLKS; } - xfs_perag_put(pag); xfs_alloc_log_agf(tp, agbp, logflags); @@ -3006,7 +3000,7 @@ xfs_alloc_read_agf( ASSERT(!(*bpp)->b_error); agf = (*bpp)->b_addr; - pag = xfs_perag_get(mp, agno); + pag = (*bpp)->b_pag; if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); @@ -3034,7 +3028,6 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); } #endif - xfs_perag_put(pag); return 0; } diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 60c453cb3ee3..8e01231b308e 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -38,16 +38,14 @@ xfs_allocbt_set_root( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); int btnum = cur->bc_btnum; - struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + struct xfs_perag *pag = agbp->b_pag; ASSERT(ptr->s != 0); agf->agf_roots[btnum] = ptr->s; be32_add_cpu(&agf->agf_levels[btnum], inc); pag->pagf_levels[btnum] += inc; - xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -115,7 +113,6 @@ xfs_allocbt_update_lastrec( int reason) { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); struct xfs_perag *pag; __be32 len; int numrecs; @@ -160,9 +157,8 @@ xfs_allocbt_update_lastrec( } agf->agf_longest = len; - pag = xfs_perag_get(cur->bc_mp, seqno); + pag = cur->bc_ag.agbp->b_pag; pag->pagf_longest = be32_to_cpu(len); - xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST); } @@ -484,7 +480,7 @@ xfs_allocbt_init_common( ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); cur->bc_tp = tp; cur->bc_mp = mp; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 3b1bd6e112f8..2e055c079f39 
100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -46,6 +46,7 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); /* * Internal routines when attribute list is more than one block. @@ -53,6 +54,8 @@ STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC int xfs_attr_node_addname(xfs_da_args_t *args); STATIC int xfs_attr_node_removename(xfs_da_args_t *args); +STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, + struct xfs_da_state **state); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); @@ -175,8 +178,13 @@ xfs_attr_try_sf_addname( struct xfs_da_args *args) { - struct xfs_mount *mp = dp->i_mount; - int error, error2; + int error; + + /* + * Build initial attribute list (if required). + */ + if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS) + xfs_attr_shortform_create(args); error = xfs_attr_shortform_addname(args); if (error == -ENOSPC) @@ -189,12 +197,70 @@ xfs_attr_try_sf_addname( if (!error && !(args->op_flags & XFS_DA_OP_NOTIME)) xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (dp->i_mount->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(args->trans); - error2 = xfs_trans_commit(args->trans); - args->trans = NULL; - return error ? error : error2; + return error; +} + +/* + * Check to see if the attr should be upgraded from non-existent or shortform to + * single-leaf-block attribute list. + */ +static inline bool +xfs_attr_is_shortform( + struct xfs_inode *ip) +{ + return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL || + (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_afp->if_nextents == 0); +} + +/* + * Attempts to set an attr in shortform, or converts short form to leaf form if + * there is not enough room. If the attr is set, the transaction is committed + * and set to NULL. + */ +STATIC int +xfs_attr_set_shortform( + struct xfs_da_args *args, + struct xfs_buf **leaf_bp) +{ + struct xfs_inode *dp = args->dp; + int error, error2 = 0; + + /* + * Try to add the attr to the attribute list in the inode. + */ + error = xfs_attr_try_sf_addname(dp, args); + if (error != -ENOSPC) { + error2 = xfs_trans_commit(args->trans); + args->trans = NULL; + return error ? error : error2; + } + /* + * It won't fit in the shortform, transform to a leaf block. GROT: + * another possible req'mt for a double-split btree op. + */ + error = xfs_attr_shortform_to_leaf(args, leaf_bp); + if (error) + return error; + + /* + * Prevent the leaf buffer from being unlocked so that a concurrent AIL + * push cannot grab the half-baked leaf buffer and run into problems + * with the write verifier. Once we're done rolling the transaction we + * can release the hold and add the attr to the leaf. 
+ */ + xfs_trans_bhold(args->trans, *leaf_bp); + error = xfs_defer_finish(&args->trans); + xfs_trans_bhold_release(args->trans, *leaf_bp); + if (error) { + xfs_trans_brelse(args->trans, *leaf_bp); + return error; + } + + return 0; } /* @@ -206,61 +272,94 @@ xfs_attr_set_args( { struct xfs_inode *dp = args->dp; struct xfs_buf *leaf_bp = NULL; - int error; + int error = 0; /* - * If the attribute list is non-existent or a shortform list, - * upgrade it to a single-leaf-block attribute list. + * If the attribute list is already in leaf format, jump straight to + * leaf handling. Otherwise, try to add the attribute to the shortform + * list; if there's no room then convert the list to leaf format and try + * again. */ - if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL || - (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - dp->i_afp->if_nextents == 0)) { + if (xfs_attr_is_shortform(dp)) { /* - * Build initial attribute list (if required). + * If the attr was successfully set in shortform, the + * transaction is committed and set to NULL. Otherwise, is it + * converted from shortform to leaf, and the transaction is + * retained. */ - if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS) - xfs_attr_shortform_create(args); + error = xfs_attr_set_shortform(args, &leaf_bp); + if (error || !args->trans) + return error; + } - /* - * Try to add the attr to the attribute list in the inode. - */ - error = xfs_attr_try_sf_addname(dp, args); + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_addname(args); if (error != -ENOSPC) return error; /* - * It won't fit in the shortform, transform to a leaf block. - * GROT: another possible req'mt for a double-split btree op. + * Promote the attribute list to the Btree format. */ - error = xfs_attr_shortform_to_leaf(args, &leaf_bp); + error = xfs_attr3_leaf_to_node(args); if (error) return error; /* - * Prevent the leaf buffer from being unlocked so that a - * concurrent AIL push cannot grab the half-baked leaf - * buffer and run into problems with the write verifier. - * Once we're done rolling the transaction we can release - * the hold and add the attr to the leaf. + * Finish any deferred work items and roll the transaction once + * more. The goal here is to call node_addname with the inode + * and transaction in the same state (inode locked and joined, + * transaction clean) no matter how we got to this step. */ - xfs_trans_bhold(args->trans, leaf_bp); error = xfs_defer_finish(&args->trans); - xfs_trans_bhold_release(args->trans, leaf_bp); - if (error) { - xfs_trans_brelse(args->trans, leaf_bp); + if (error) + return error; + + /* + * Commit the current trans (including the inode) and + * start a new one. 
+ */ + error = xfs_trans_roll_inode(&args->trans, dp); + if (error) return error; - } } - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) - error = xfs_attr_leaf_addname(args); - else - error = xfs_attr_node_addname(args); + error = xfs_attr_node_addname(args); return error; } /* + * Return EEXIST if attr is found, or ENOATTR if not + */ +int +xfs_has_attr( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; + int error; + + if (!xfs_inode_hasattr(dp)) + return -ENOATTR; + + if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) { + ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); + return xfs_attr_sf_findname(args, NULL, NULL); + } + + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_hasname(args, &bp); + + if (bp) + xfs_trans_brelse(args->trans, bp); + + return error; + } + + return xfs_attr_node_hasname(args, NULL); +} + +/* * Remove the attribute specified in @args. */ int @@ -370,6 +469,15 @@ xfs_attr_set( args->total, 0, quota_flags); if (error) goto out_trans_cancel; + + error = xfs_has_attr(args); + if (error == -EEXIST && (args->attr_flags & XATTR_CREATE)) + goto out_trans_cancel; + if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) + goto out_trans_cancel; + if (error != -ENOATTR && error != -EEXIST) + goto out_trans_cancel; + error = xfs_attr_set_args(args); if (error) goto out_trans_cancel; @@ -377,6 +485,10 @@ xfs_attr_set( if (!args->trans) goto out_unlock; } else { + error = xfs_has_attr(args); + if (error != -EEXIST) + goto out_trans_cancel; + error = xfs_attr_remove_args(args); if (error) goto out_trans_cancel; @@ -459,36 +571,54 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) * External routines when attribute list is one block *========================================================================*/ +/* Store info about a remote block */ +STATIC void +xfs_attr_save_rmt_blk( + struct xfs_da_args *args) +{ + args->blkno2 = args->blkno; + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; +} + +/* Set stored info about a remote block */ +STATIC void +xfs_attr_restore_rmt_blk( + struct xfs_da_args *args) +{ + args->blkno = args->blkno2; + args->index = args->index2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; +} + /* - * Add a name to the leaf attribute list structure + * Tries to add an attribute to an inode in leaf form * - * This leaf block cannot have a "remote" value, we only call this routine - * if bmap_one_block() says there is only one block (ie: no remote blks). + * This function is meant to execute as part of a delayed operation and leaves + * the transaction handling to the caller. On success the attribute is added + * and the inode and transaction are left dirty. If there is not enough space, + * the attr data is converted to node format and -ENOSPC is returned. Caller is + * responsible for handling the dirty inode and transaction or adding the attr + * in node format. */ STATIC int -xfs_attr_leaf_addname( - struct xfs_da_args *args) +xfs_attr_leaf_try_add( + struct xfs_da_args *args, + struct xfs_buf *bp) { - struct xfs_inode *dp; - struct xfs_buf *bp; - int retval, error, forkoff; - - trace_xfs_attr_leaf_addname(args); - - /* - * Read the (only) block in the attribute list in. 
- */ - dp = args->dp; - args->blkno = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); - if (error) - return error; + int retval; /* * Look up the given attribute in the leaf block. Figure out if * the given flags produce an error or call for an atomic rename. */ - retval = xfs_attr3_leaf_lookup_int(bp, args); + retval = xfs_attr_leaf_hasname(args, &bp); + if (retval != -ENOATTR && retval != -EEXIST) + return retval; if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) goto out_brelse; if (retval == -EEXIST) { @@ -499,11 +629,7 @@ xfs_attr_leaf_addname( /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ - args->blkno2 = args->blkno; /* set 2nd entry info*/ - args->index2 = args->index; - args->rmtblkno2 = args->rmtblkno; - args->rmtblkcnt2 = args->rmtblkcnt; - args->rmtvaluelen2 = args->rmtvaluelen; + xfs_attr_save_rmt_blk(args); /* * clear the remote attr state now that it is saved so that the @@ -516,37 +642,35 @@ xfs_attr_leaf_addname( } /* - * Add the attribute to the leaf block, transitioning to a Btree - * if required. + * Add the attribute to the leaf block */ - retval = xfs_attr3_leaf_add(bp, args); - if (retval == -ENOSPC) { - /* - * Promote the attribute list to the Btree format, then - * Commit that transaction so that the node_addname() call - * can manage its own transactions. - */ - error = xfs_attr3_leaf_to_node(args); - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; + return xfs_attr3_leaf_add(bp, args); - /* - * Commit the current trans (including the inode) and start - * a new one. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - return error; +out_brelse: + xfs_trans_brelse(args->trans, bp); + return retval; +} - /* - * Fob the whole rest of the problem off on the Btree code. - */ - error = xfs_attr_node_addname(args); + +/* + * Add a name to the leaf attribute list structure + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_addname( + struct xfs_da_args *args) +{ + int error, forkoff; + struct xfs_buf *bp = NULL; + struct xfs_inode *dp = args->dp; + + trace_xfs_attr_leaf_addname(args); + + error = xfs_attr_leaf_try_add(args, bp); + if (error) return error; - } /* * Commit the transaction that added the attr name so that @@ -568,75 +692,93 @@ xfs_attr_leaf_addname( return error; } - /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. - */ - if (args->op_flags & XFS_DA_OP_RENAME) { + if (!(args->op_flags & XFS_DA_OP_RENAME)) { /* - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. + * Added a "remote" value, just clear the incomplete flag. */ - error = xfs_attr3_leaf_flipflags(args); + if (args->rmtblkno > 0) + error = xfs_attr3_leaf_clearflag(args); + + return error; + } + + /* + * If this is an atomic rename operation, we must "flip" the incomplete + * flags on the "new" and "old" attribute/value pairs so that one + * disappears and one appears atomically. Then we must remove the "old" + * attribute/value pair. 
+ * + * In a separate transaction, set the incomplete flag on the "old" attr + * and clear the incomplete flag on the "new" attr. + */ + + error = xfs_attr3_leaf_flipflags(args); + if (error) + return error; + /* + * Commit the flag value change and start the next trans in series. + */ + error = xfs_trans_roll_inode(&args->trans, args->dp); + if (error) + return error; + + /* + * Dismantle the "old" attribute/value pair by removing a "remote" value + * (if it exists). + */ + xfs_attr_restore_rmt_blk(args); + + if (args->rmtblkno) { + error = xfs_attr_rmtval_invalidate(args); if (error) return error; - /* - * Dismantle the "old" attribute/value pair by removing - * a "remote" value (if it exists). - */ - args->index = args->index2; - args->blkno = args->blkno2; - args->rmtblkno = args->rmtblkno2; - args->rmtblkcnt = args->rmtblkcnt2; - args->rmtvaluelen = args->rmtvaluelen2; - if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(args); - if (error) - return error; - } - - /* - * Read in the block containing the "old" attr, then - * remove the "old" attr from that block (neat, huh!) - */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - &bp); + error = xfs_attr_rmtval_remove(args); if (error) return error; + } - xfs_attr3_leaf_remove(bp, args); + /* + * Read in the block containing the "old" attr, then remove the "old" + * attr from that block (neat, huh!) + */ + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, + &bp); + if (error) + return error; - /* - * If the result is small enough, shrink it all into the inode. - */ - if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; - } + xfs_attr3_leaf_remove(bp, args); - /* - * Commit the remove and start the next trans in series. - */ - error = xfs_trans_roll_inode(&args->trans, dp); + /* + * If the result is small enough, shrink it all into the inode. + */ + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + + return error; +} + +/* + * Return EEXIST if attr is found, or ENOATTR if not + */ +STATIC int +xfs_attr_leaf_hasname( + struct xfs_da_args *args, + struct xfs_buf **bp) +{ + int error = 0; + + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp); + if (error) + return error; + + error = xfs_attr3_leaf_lookup_int(*bp, args); + if (error != -ENOATTR && error != -EEXIST) + xfs_trans_brelse(args->trans, *bp); - } else if (args->rmtblkno > 0) { - /* - * Added a "remote" value, just clear the incomplete flag. - */ - error = xfs_attr3_leaf_clearflag(args); - } return error; -out_brelse: - xfs_trans_brelse(args->trans, bp); - return retval; } /* @@ -659,31 +801,25 @@ xfs_attr_leaf_removename( * Remove the attribute. */ dp = args->dp; - args->blkno = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); - if (error) - return error; - error = xfs_attr3_leaf_lookup_int(bp, args); + error = xfs_attr_leaf_hasname(args, &bp); + if (error == -ENOATTR) { xfs_trans_brelse(args->trans, bp); return error; - } + } else if (error != -EEXIST) + return error; xfs_attr3_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. 
*/ - if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) + return xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; - } + return 0; } @@ -703,21 +839,53 @@ xfs_attr_leaf_get(xfs_da_args_t *args) trace_xfs_attr_leaf_get(args); - args->blkno = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); - if (error) - return error; + error = xfs_attr_leaf_hasname(args, &bp); - error = xfs_attr3_leaf_lookup_int(bp, args); - if (error != -EEXIST) { + if (error == -ENOATTR) { xfs_trans_brelse(args->trans, bp); return error; - } + } else if (error != -EEXIST) + return error; + + error = xfs_attr3_leaf_getvalue(bp, args); xfs_trans_brelse(args->trans, bp); return error; } +/* + * Return EEXIST if attr is found, or ENOATTR if not + * statep: If not null is set to point at the found state. Caller will + * be responsible for freeing the state in this case. + */ +STATIC int +xfs_attr_node_hasname( + struct xfs_da_args *args, + struct xfs_da_state **statep) +{ + struct xfs_da_state *state; + int retval, error; + + state = xfs_da_state_alloc(args); + if (statep != NULL) + *statep = NULL; + + /* + * Search to see if name exists, and get back a pointer to it. + */ + error = xfs_da3_node_lookup_int(state, &retval); + if (error) { + xfs_da_state_free(state); + return error; + } + + if (statep != NULL) + *statep = state; + else + xfs_da_state_free(state); + return retval; +} + /*======================================================================== * External routines when attribute list size > geo->blksize *========================================================================*/ @@ -739,7 +907,6 @@ xfs_attr_node_addname( struct xfs_da_state *state; struct xfs_da_state_blk *blk; struct xfs_inode *dp; - struct xfs_mount *mp; int retval, error; trace_xfs_attr_node_addname(args); @@ -748,19 +915,15 @@ xfs_attr_node_addname( * Fill in bucket of arguments/results/context to carry around. */ dp = args->dp; - mp = dp->i_mount; restart: - state = xfs_da_state_alloc(); - state->args = args; - state->mp = mp; - /* * Search to see if name already exists, and get back a pointer * to where it should go. */ - error = xfs_da3_node_lookup_int(state, &retval); - if (error) + retval = xfs_attr_node_hasname(args, &state); + if (retval != -ENOATTR && retval != -EEXIST) goto out; + blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) @@ -773,11 +936,7 @@ restart: /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ - args->blkno2 = args->blkno; /* set 2nd entry info*/ - args->index2 = args->index; - args->rmtblkno2 = args->rmtblkno; - args->rmtblkcnt2 = args->rmtblkcnt; - args->rmtvaluelen2 = args->rmtvaluelen; + xfs_attr_save_rmt_blk(args); /* * clear the remote attr state now that it is saved so that the @@ -863,82 +1022,75 @@ restart: return error; } - /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. 
- */ - if (args->op_flags & XFS_DA_OP_RENAME) { + if (!(args->op_flags & XFS_DA_OP_RENAME)) { /* - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. + * Added a "remote" value, just clear the incomplete flag. */ - error = xfs_attr3_leaf_flipflags(args); - if (error) - goto out; + if (args->rmtblkno > 0) + error = xfs_attr3_leaf_clearflag(args); + retval = error; + goto out; + } - /* - * Dismantle the "old" attribute/value pair by removing - * a "remote" value (if it exists). - */ - args->index = args->index2; - args->blkno = args->blkno2; - args->rmtblkno = args->rmtblkno2; - args->rmtblkcnt = args->rmtblkcnt2; - args->rmtvaluelen = args->rmtvaluelen2; - if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(args); - if (error) - return error; - } + /* + * If this is an atomic rename operation, we must "flip" the incomplete + * flags on the "new" and "old" attribute/value pairs so that one + * disappears and one appears atomically. Then we must remove the "old" + * attribute/value pair. + * + * In a separate transaction, set the incomplete flag on the "old" attr + * and clear the incomplete flag on the "new" attr. + */ + error = xfs_attr3_leaf_flipflags(args); + if (error) + goto out; + /* + * Commit the flag value change and start the next trans in series + */ + error = xfs_trans_roll_inode(&args->trans, args->dp); + if (error) + goto out; - /* - * Re-find the "old" attribute entry after any split ops. - * The INCOMPLETE flag means that we will find the "old" - * attr, not the "new" one. - */ - args->attr_filter |= XFS_ATTR_INCOMPLETE; - state = xfs_da_state_alloc(); - state->args = args; - state->mp = mp; - state->inleaf = 0; - error = xfs_da3_node_lookup_int(state, &retval); + /* + * Dismantle the "old" attribute/value pair by removing a "remote" value + * (if it exists). + */ + xfs_attr_restore_rmt_blk(args); + + if (args->rmtblkno) { + error = xfs_attr_rmtval_invalidate(args); if (error) - goto out; + return error; - /* - * Remove the name and update the hashvals in the tree. - */ - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr3_leaf_remove(blk->bp, args); - xfs_da3_fixhashpath(state, &state->path); + error = xfs_attr_rmtval_remove(args); + if (error) + return error; + } - /* - * Check to see if the tree needs to be collapsed. - */ - if (retval && (state->path.active > 1)) { - error = xfs_da3_join(state); - if (error) - goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; - } + /* + * Re-find the "old" attribute entry after any split ops. The INCOMPLETE + * flag means that we will find the "old" attr, not the "new" one. + */ + args->attr_filter |= XFS_ATTR_INCOMPLETE; + state = xfs_da_state_alloc(args); + state->inleaf = 0; + error = xfs_da3_node_lookup_int(state, &retval); + if (error) + goto out; - /* - * Commit and start the next trans in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - goto out; + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[state->path.active-1]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + error = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); - } else if (args->rmtblkno > 0) { - /* - * Added a "remote" value, just clear the incomplete flag. - */ - error = xfs_attr3_leaf_clearflag(args); + /* + * Check to see if the tree needs to be collapsed. 
+ */ + if (retval && (state->path.active > 1)) { + error = xfs_da3_join(state); if (error) goto out; } @@ -953,6 +1105,114 @@ out: } /* + * Shrink an attribute from leaf to shortform + */ +STATIC int +xfs_attr_node_shrink( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + struct xfs_inode *dp = args->dp; + int error, forkoff; + struct xfs_buf *bp; + + /* + * Have to get rid of the copy of this dabuf in the state. + */ + ASSERT(state->path.active == 1); + ASSERT(state->path.blk[0].bp); + state->path.blk[0].bp = NULL; + + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; + + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) { + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + } else + xfs_trans_brelse(args->trans, bp); + + return error; +} + +/* + * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers + * for later deletion of the entry. + */ +STATIC int +xfs_attr_leaf_mark_incomplete( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + int error; + + /* + * Fill in disk block numbers in the state structure + * so that we can get the buffers back after we commit + * several transactions in the following calls. + */ + error = xfs_attr_fillstate(state); + if (error) + return error; + + /* + * Mark the attribute as INCOMPLETE + */ + return xfs_attr3_leaf_setflag(args); +} + +/* + * Initial setup for xfs_attr_node_removename. Make sure the attr is there and + * the blocks are valid. Attr keys with remote blocks will be marked + * incomplete. + */ +STATIC +int xfs_attr_node_removename_setup( + struct xfs_da_args *args, + struct xfs_da_state **state) +{ + int error; + + error = xfs_attr_node_hasname(args, state); + if (error != -EEXIST) + return error; + + ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); + ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == + XFS_ATTR_LEAF_MAGIC); + + if (args->rmtblkno > 0) { + error = xfs_attr_leaf_mark_incomplete(args, *state); + if (error) + return error; + + return xfs_attr_rmtval_invalidate(args); + } + + return 0; +} + +STATIC int +xfs_attr_node_remove_rmt( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + int error = 0; + + error = xfs_attr_rmtval_remove(args); + if (error) + return error; + + /* + * Refill the state structure with buffers, the prior calls released our + * buffers. + */ + return xfs_attr_refillstate(state); +} + +/* * Remove a name from a B-tree attribute list. * * This will involve walking down the Btree, and may involve joining @@ -965,64 +1225,22 @@ xfs_attr_node_removename( { struct xfs_da_state *state; struct xfs_da_state_blk *blk; - struct xfs_inode *dp; - struct xfs_buf *bp; - int retval, error, forkoff; + int retval, error; + struct xfs_inode *dp = args->dp; trace_xfs_attr_node_removename(args); - /* - * Tie a string around our finger to remind us where we are. - */ - dp = args->dp; - state = xfs_da_state_alloc(); - state->args = args; - state->mp = dp->i_mount; - - /* - * Search to see if name exists, and get back a pointer to it. - */ - error = xfs_da3_node_lookup_int(state, &retval); - if (error || (retval != -EEXIST)) { - if (error == 0) - error = retval; + error = xfs_attr_node_removename_setup(args, &state); + if (error) goto out; - } /* * If there is an out-of-line value, de-allocate the blocks. * This is done before we remove the attribute so that we don't * overflow the maximum size of a transaction and/or hit a deadlock. 
*/ - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->bp != NULL); - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); if (args->rmtblkno > 0) { - /* - * Fill in disk block numbers in the state structure - * so that we can get the buffers back after we commit - * several transactions in the following calls. - */ - error = xfs_attr_fillstate(state); - if (error) - goto out; - - /* - * Mark the attribute as INCOMPLETE, then bunmapi() the - * remote value. - */ - error = xfs_attr3_leaf_setflag(args); - if (error) - goto out; - error = xfs_attr_rmtval_remove(args); - if (error) - goto out; - - /* - * Refill the state structure with buffers, the prior calls - * released our buffers. - */ - error = xfs_attr_refillstate(state); + error = xfs_attr_node_remove_rmt(args, state); if (error) goto out; } @@ -1056,33 +1274,12 @@ xfs_attr_node_removename( /* * If the result is small enough, push it all into the inode. */ - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - /* - * Have to get rid of the copy of this dabuf in the state. - */ - ASSERT(state->path.active == 1); - ASSERT(state->path.blk[0].bp); - state->path.blk[0].bp = NULL; - - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); - if (error) - goto out; - - if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ - if (error) - goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; - } else - xfs_trans_brelse(args->trans, bp); - } - error = 0; + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + error = xfs_attr_node_shrink(args, state); out: - xfs_da_state_free(state); + if (state) + xfs_da_state_free(state); return error; } @@ -1198,47 +1395,41 @@ xfs_attr_refillstate(xfs_da_state_t *state) * Returns 0 on successful retrieval, otherwise an error. */ STATIC int -xfs_attr_node_get(xfs_da_args_t *args) +xfs_attr_node_get( + struct xfs_da_args *args) { - xfs_da_state_t *state; - xfs_da_state_blk_t *blk; - int error, retval; - int i; + struct xfs_da_state *state; + struct xfs_da_state_blk *blk; + int i; + int error; trace_xfs_attr_node_get(args); - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; - /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_da3_node_lookup_int(state, &retval); - if (error) { - retval = error; - goto out_release; - } - if (retval != -EEXIST) + error = xfs_attr_node_hasname(args, &state); + if (error != -EEXIST) goto out_release; /* * Get the value, local or "remote" */ blk = &state->path.blk[state->path.active - 1]; - retval = xfs_attr3_leaf_getvalue(blk->bp, args); + error = xfs_attr3_leaf_getvalue(blk->bp, args); /* * If not in a transaction, we have to release all the buffers. */ out_release: - for (i = 0; i < state->path.active; i++) { + for (i = 0; state != NULL && i < state->path.active; i++) { xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } - xfs_da_state_free(state); - return retval; + if (state) + xfs_da_state_free(state); + return error; } /* Returns true if the attribute entry name is valid. 
*/ diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index db4717657ca1..3e97a935e712 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -89,6 +89,7 @@ int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); int xfs_attr_set_args(struct xfs_da_args *args); +int xfs_has_attr(struct xfs_da_args *args); int xfs_attr_remove_args(struct xfs_da_args *args); bool xfs_attr_namecheck(const void *name, size_t length); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2f7e89e4be3e..8623c815164a 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -660,18 +660,65 @@ xfs_attr_shortform_create( } /* + * Return -EEXIST if attr is found, or -ENOATTR if not + * args: args containing attribute name and namelen + * sfep: If not null, pointer will be set to the last attr entry found on + -EEXIST. On -ENOATTR pointer is left at the last entry in the list + * basep: If not null, pointer is set to the byte offset of the entry in the + * list on -EEXIST. On -ENOATTR, pointer is left at the byte offset of + * the last entry in the list + */ +int +xfs_attr_sf_findname( + struct xfs_da_args *args, + struct xfs_attr_sf_entry **sfep, + unsigned int *basep) +{ + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + unsigned int base = sizeof(struct xfs_attr_sf_hdr); + int size = 0; + int end; + int i; + + sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; + sfe = &sf->list[0]; + end = sf->hdr.count; + for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), + base += size, i++) { + size = XFS_ATTR_SF_ENTSIZE(sfe); + if (!xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + continue; + break; + } + + if (sfep != NULL) + *sfep = sfe; + + if (basep != NULL) + *basep = base; + + if (i == end) + return -ENOATTR; + return -EEXIST; +} + +/* * Add a name/value pair to the shortform attribute list. * Overflow from the inode has already been checked for. */ void -xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) +xfs_attr_shortform_add( + struct xfs_da_args *args, + int forkoff) { - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; - int i, offset, size; - xfs_mount_t *mp; - xfs_inode_t *dp; - struct xfs_ifork *ifp; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + int offset, size; + struct xfs_mount *mp; + struct xfs_inode *dp; + struct xfs_ifork *ifp; trace_xfs_attr_sf_add(args); @@ -682,11 +729,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) ifp = dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { - ASSERT(!xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)); - } + if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) + ASSERT(0); offset = (char *)sfe - (char *)sf; size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); @@ -728,31 +772,27 @@ xfs_attr_fork_remove( * Remove an attribute from the shortform attribute list structure. 
*/ int -xfs_attr_shortform_remove(xfs_da_args_t *args) +xfs_attr_shortform_remove( + struct xfs_da_args *args) { - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; - int base, size=0, end, totsize, i; - xfs_mount_t *mp; - xfs_inode_t *dp; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + int size = 0, end, totsize; + unsigned int base; + struct xfs_mount *mp; + struct xfs_inode *dp; + int error; trace_xfs_attr_sf_remove(args); dp = args->dp; mp = dp->i_mount; - base = sizeof(xfs_attr_sf_hdr_t); sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; - sfe = &sf->list[0]; - end = sf->hdr.count; - for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), - base += size, i++) { - size = XFS_ATTR_SF_ENTSIZE(sfe); - if (xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - break; - } - if (i == end) - return -ENOATTR; + + error = xfs_attr_sf_findname(args, &sfe, &base); + if (error != -EEXIST) + return error; + size = XFS_ATTR_SF_ENTSIZE(sfe); /* * Fix up the attribute fork data, covering the hole @@ -2742,10 +2782,7 @@ xfs_attr3_leaf_clearflag( XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - /* - * Commit the flag value change and start the next trans in series. - */ - return xfs_trans_roll_inode(&args->trans, args->dp); + return 0; } /* @@ -2793,10 +2830,7 @@ xfs_attr3_leaf_setflag( XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - /* - * Commit the flag value change and start the next trans in series. - */ - return xfs_trans_roll_inode(&args->trans, args->dp); + return 0; } /* @@ -2911,10 +2945,5 @@ xfs_attr3_leaf_flipflags( XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt))); } - /* - * Commit the flag value change and start the next trans in series. - */ - error = xfs_trans_roll_inode(&args->trans, args->dp); - - return error; + return 0; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 5be6be309302..9b1c59f40a26 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -52,6 +52,9 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, struct xfs_buf **leaf_bp); int xfs_attr_shortform_remove(struct xfs_da_args *args); +int xfs_attr_sf_findname(struct xfs_da_args *args, + struct xfs_attr_sf_entry **sfep, + unsigned int *basep); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 01ad7f353e08..3f80cede7406 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -440,32 +440,23 @@ xfs_attr_rmtval_get( } /* - * Write the value associated with an attribute into the out-of-line buffer - * that we have defined for it. + * Find a "hole" in the attribute address space large enough for us to drop the + * new attribute's value into */ -int -xfs_attr_rmtval_set( +STATIC int +xfs_attr_rmt_find_hole( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - struct xfs_bmbt_irec map; - xfs_dablk_t lblkno; - xfs_fileoff_t lfileoff = 0; - uint8_t *src = args->value; - int blkcnt; - int valuelen; - int nmap; int error; - int offset = 0; - - trace_xfs_attr_rmtval_set(args); + int blkcnt; + xfs_fileoff_t lfileoff = 0; /* - * Find a "hole" in the attribute address space large enough for - * us to drop the new attribute's value into. 
Because CRC enable - * attributes have headers, we can't just do a straight byte to FSB - * conversion and have to take the header space into account. + * Because CRC enable attributes have headers, we can't just do a + * straight byte to FSB conversion and have to take the header space + * into account. */ blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, @@ -473,48 +464,26 @@ xfs_attr_rmtval_set( if (error) return error; - args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; + args->rmtblkno = (xfs_dablk_t)lfileoff; args->rmtblkcnt = blkcnt; - /* - * Roll through the "value", allocating blocks on disk as required. - */ - while (blkcnt > 0) { - /* - * Allocate a single extent, up to the size of the value. - * - * Note that we have to consider this a data allocation as we - * write the remote attribute without logging the contents. - * Hence we must ensure that we aren't using blocks that are on - * the busy list so that we don't overwrite blocks which have - * recently been freed but their transactions are not yet - * committed to disk. If we overwrite the contents of a busy - * extent and then crash then the block may not contain the - * correct metadata after log recovery occurs. - */ - nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map, - &nmap); - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; - - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - lblkno += map.br_blockcount; - blkcnt -= map.br_blockcount; + return 0; +} - /* - * Start the next trans in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - return error; - } +STATIC int +xfs_attr_rmtval_set_value( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_bmbt_irec map; + xfs_dablk_t lblkno; + uint8_t *src = args->value; + int blkcnt; + int valuelen; + int nmap; + int error; + int offset = 0; /* * Roll through the "value", copying the attribute value to the @@ -595,19 +564,82 @@ xfs_attr_rmtval_stale( } /* + * Write the value associated with an attribute into the out-of-line buffer + * that we have defined for it. + */ +int +xfs_attr_rmtval_set( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_bmbt_irec map; + xfs_dablk_t lblkno; + int blkcnt; + int nmap; + int error; + + trace_xfs_attr_rmtval_set(args); + + error = xfs_attr_rmt_find_hole(args); + if (error) + return error; + + blkcnt = args->rmtblkcnt; + lblkno = (xfs_dablk_t)args->rmtblkno; + /* + * Roll through the "value", allocating blocks on disk as required. + */ + while (blkcnt > 0) { + /* + * Allocate a single extent, up to the size of the value. + * + * Note that we have to consider this a data allocation as we + * write the remote attribute without logging the contents. + * Hence we must ensure that we aren't using blocks that are on + * the busy list so that we don't overwrite blocks which have + * recently been freed but their transactions are not yet + * committed to disk. If we overwrite the contents of a busy + * extent and then crash then the block may not contain the + * correct metadata after log recovery occurs. 
+ */ + nmap = 1; + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, + blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map, + &nmap); + if (error) + return error; + error = xfs_defer_finish(&args->trans); + if (error) + return error; + + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + + /* + * Start the next trans in the chain. + */ + error = xfs_trans_roll_inode(&args->trans, dp); + if (error) + return error; + } + + return xfs_attr_rmtval_set_value(args); +} + +/* * Remove the value associated with an attribute by deleting the * out-of-line buffer that it is stored on. */ int -xfs_attr_rmtval_remove( +xfs_attr_rmtval_invalidate( struct xfs_da_args *args) { xfs_dablk_t lblkno; int blkcnt; int error; - int done; - - trace_xfs_attr_rmtval_remove(args); /* * Roll through the "value", invalidating the attribute value's blocks. @@ -635,21 +667,29 @@ xfs_attr_rmtval_remove( lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; } + return 0; +} + +/* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. + */ +int +xfs_attr_rmtval_remove( + struct xfs_da_args *args) +{ + int error; + int retval; + + trace_xfs_attr_rmtval_remove(args); /* * Keep de-allocating extents until the remote-value region is gone. */ - lblkno = args->rmtblkno; - blkcnt = args->rmtblkcnt; - done = 0; - while (!done) { - error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, - XFS_BMAPI_ATTRFORK, 1, &done); - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; + do { + retval = __xfs_attr_rmtval_remove(args); + if (retval && retval != -EAGAIN) + return retval; /* * Close out trans and start the next one in the chain. @@ -657,6 +697,36 @@ xfs_attr_rmtval_remove( error = xfs_trans_roll_inode(&args->trans, args->dp); if (error) return error; - } + } while (retval == -EAGAIN); + return 0; } + +/* + * Remove the value associated with an attribute by deleting the out-of-line + * buffer that it is stored on. Returns EAGAIN for the caller to refresh the + * transaction and re-call the function + */ +int +__xfs_attr_rmtval_remove( + struct xfs_da_args *args) +{ + int error, done; + + /* + * Unmap value blocks for this attr. 
+ */ + error = xfs_bunmapi(args->trans, args->dp, args->rmtblkno, + args->rmtblkcnt, XFS_BMAPI_ATTRFORK, 1, &done); + if (error) + return error; + + error = xfs_defer_finish(&args->trans); + if (error) + return error; + + if (!done) + return -EAGAIN; + + return error; +} diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index e1144f22b005..9eee615da156 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -13,5 +13,6 @@ int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); - +int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); +int __xfs_attr_rmtval_remove(struct xfs_da_args *args); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 667cdd0dfdf4..9c40d5971035 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -553,7 +553,8 @@ __xfs_bmap_add_free( #endif ASSERT(xfs_bmap_free_item_zone != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); + new = kmem_cache_alloc(xfs_bmap_free_item_zone, + GFP_KERNEL | __GFP_NOFAIL); new->xefi_startblock = bno; new->xefi_blockcount = (xfs_extlen_t)len; if (oinfo) @@ -1098,7 +1099,10 @@ xfs_bmap_add_attrfork( if (error) goto trans_cancel; ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0); + + ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone, + GFP_KERNEL | __GFP_NOFAIL); + ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS; ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 6028a3c825ba..e1bd484e5548 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -158,17 +158,22 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) { BMAP_ATTRFORK, "ATTR" }, \ { BMAP_COWFORK, "COW" } +/* Return true if the extent is an allocated extent, written or not. */ +static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +{ + return irec->br_startblock != HOLESTARTBLOCK && + irec->br_startblock != DELAYSTARTBLOCK && + !isnullstartblock(irec->br_startblock); +} /* * Return true if the extent is a real, allocated extent, or false if it is a * delayed allocation, and unwritten extent or a hole. 
*/ -static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) { - return irec->br_state != XFS_EXT_UNWRITTEN && - irec->br_startblock != HOLESTARTBLOCK && - irec->br_startblock != DELAYSTARTBLOCK && - !isnullstartblock(irec->br_startblock); + return xfs_bmap_is_real_extent(irec) && + irec->br_state != XFS_EXT_UNWRITTEN; } /* diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index d9c63f17d2de..ecec604e6e4d 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -552,7 +552,7 @@ xfs_bmbt_init_cursor( struct xfs_btree_cur *cur; ASSERT(whichfork != XFS_COW_FORK); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); cur->bc_tp = tp; cur->bc_mp = mp; diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index 643f0f9b2994..f0d2976050ae 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -18,7 +18,7 @@ struct xbtree_afakeroot { unsigned int af_blocks; }; -/* Cursor interactions with with fake roots for AG-rooted btrees. */ +/* Cursor interactions with fake roots for AG-rooted btrees. */ void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur, struct xbtree_afakeroot *afake); void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, @@ -45,7 +45,7 @@ struct xbtree_ifakeroot { unsigned int if_extents; }; -/* Cursor interactions with with fake roots for inode-rooted btrees. */ +/* Cursor interactions with fake roots for inode-rooted btrees. */ void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur, struct xbtree_ifakeroot *ifake, struct xfs_btree_ops **new_ops); @@ -90,7 +90,7 @@ struct xfs_btree_bload { /* * Number of free records to leave in each leaf block. If the caller - * sets this to -1, the slack value will be calculated to be be halfway + * sets this to -1, the slack value will be calculated to be halfway * between maxrecs and minrecs. This typically leaves the block 75% * full. Note that slack values are not enforced on inode root blocks. */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 897749c41f36..e46bc03365db 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -78,10 +78,16 @@ kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ * Allocate a dir-state structure. * We don't put them on the stack since they're large. 
*/ -xfs_da_state_t * -xfs_da_state_alloc(void) +struct xfs_da_state * +xfs_da_state_alloc( + struct xfs_da_args *args) { - return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); + struct xfs_da_state *state; + + state = kmem_cache_zalloc(xfs_da_state_zone, GFP_NOFS | __GFP_NOFAIL); + state->args = args; + state->mp = args->dp->i_mount; + return state; } /* diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 6e25de6621e4..ad5dd324631a 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -219,7 +219,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, const unsigned char *name, int len); -xfs_da_state_t *xfs_da_state_alloc(void); +struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args); void xfs_da_state_free(xfs_da_state_t *state); void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 6ac4aad98cd7..5d51265d29d6 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -2015,9 +2015,7 @@ xfs_dir2_node_addname( /* * Allocate and initialize the state (btree cursor). */ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; + state = xfs_da_state_alloc(args); /* * Look up the name. We're not supposed to find it, but * this gives us the insertion point. @@ -2086,9 +2084,8 @@ xfs_dir2_node_lookup( /* * Allocate and initialize the btree cursor. */ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; + state = xfs_da_state_alloc(args); + /* * Fill in the path to the entry in the cursor. */ @@ -2139,9 +2136,7 @@ xfs_dir2_node_removename( /* * Allocate and initialize the btree cursor. */ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; + state = xfs_da_state_alloc(args); /* Look up the entry we're deleting, set up the cursor. */ error = xfs_da3_node_lookup_int(state, &rval); @@ -2206,9 +2201,7 @@ xfs_dir2_node_replace( /* * Allocate and initialize the btree cursor. */ - state = xfs_da_state_alloc(); - state->args = args; - state->mp = args->dp->i_mount; + state = xfs_da_state_alloc(args); /* * We have to save new inode number and ftype since diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index bedc1e752b60..5a2db00b9d5f 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -37,9 +37,10 @@ xfs_failaddr_t xfs_dquot_verify( struct xfs_mount *mp, struct xfs_disk_dquot *ddq, - xfs_dqid_t id, - uint type) /* used only during quotacheck */ + xfs_dqid_t id) /* used only during quotacheck */ { + __u8 ddq_type; + /* * We can encounter an uninitialized dquot buffer for 2 reasons: * 1. 
If we crash while deleting the quotainode(s), and those blks got @@ -60,11 +61,12 @@ xfs_dquot_verify( if (ddq->d_version != XFS_DQUOT_VERSION) return __this_address; - if (type && ddq->d_flags != type) + if (ddq->d_type & ~XFS_DQTYPE_ANY) return __this_address; - if (ddq->d_flags != XFS_DQ_USER && - ddq->d_flags != XFS_DQ_PROJ && - ddq->d_flags != XFS_DQ_GROUP) + ddq_type = ddq->d_type & XFS_DQTYPE_REC_MASK; + if (ddq_type != XFS_DQTYPE_USER && + ddq_type != XFS_DQTYPE_PROJ && + ddq_type != XFS_DQTYPE_GROUP) return __this_address; if (id != -1 && id != be32_to_cpu(ddq->d_id)) @@ -95,14 +97,13 @@ xfs_failaddr_t xfs_dqblk_verify( struct xfs_mount *mp, struct xfs_dqblk *dqb, - xfs_dqid_t id, - uint type) /* used only during quotacheck */ + xfs_dqid_t id) /* used only during quotacheck */ { if (xfs_sb_version_hascrc(&mp->m_sb) && !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - return xfs_dquot_verify(mp, &dqb->dd_diskdq, id, type); + return xfs_dquot_verify(mp, &dqb->dd_diskdq, id); } /* @@ -113,7 +114,7 @@ xfs_dqblk_repair( struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, - uint type) + xfs_dqtype_t type) { /* * Typically, a repair is only requested by quotacheck. @@ -123,7 +124,7 @@ xfs_dqblk_repair( dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION; - dqb->dd_diskdq.d_flags = type; + dqb->dd_diskdq.d_type = type; dqb->dd_diskdq.d_id = cpu_to_be32(id); if (xfs_sb_version_hascrc(&mp->m_sb)) { @@ -205,7 +206,7 @@ xfs_dquot_buf_verify( if (i == 0) id = be32_to_cpu(ddq->d_id); - fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0); + fa = xfs_dqblk_verify(mp, &dqb[i], id + i); if (fa) { if (!readahead) xfs_buf_verifier_error(bp, -EFSCORRUPTED, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index b42a52bfa1e9..31b7ece985bb 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1149,16 +1149,26 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ #define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */ +#define XFS_DQTYPE_USER 0x01 /* user dquot record */ +#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */ +#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */ + +/* bitmask to determine if this is a user/group/project dquot */ +#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ + XFS_DQTYPE_PROJ | \ + XFS_DQTYPE_GROUP) + +#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK) + /* - * This is the main portion of the on-disk representation of quota - * information for a user. This is the q_core of the struct xfs_dquot that - * is kept in kernel memory. We pad this with some more expansion room - * to construct the on disk structure. + * This is the main portion of the on-disk representation of quota information + * for a user. We pad this with some more expansion room to construct the on + * disk structure. */ struct xfs_disk_dquot { __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ __u8 d_version; /* dquot version */ - __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ + __u8 d_type; /* XFS_DQTYPE_USER/PROJ/GROUP */ __be32 d_id; /* user,project,group id */ __be64 d_blk_hardlimit;/* absolute limit on disk blks */ __be64 d_blk_softlimit;/* preferred limit on disk blks */ @@ -1199,6 +1209,22 @@ typedef struct xfs_dqblk { #define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) /* + * This defines the unit of allocation of dquots. 
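Restating the new on-disk type checks as a compact sketch (the names are the defines just introduced; this mirrors the xfs_dquot_verify() hunk above rather than adding behaviour): a valid dquot may set no bits outside XFS_DQTYPE_ANY (0x07), and the record-type bits must name exactly one of user, project or group:

        __u8 ddq_type;

        if (ddq->d_type & ~XFS_DQTYPE_ANY)              /* unknown bits set */
                return __this_address;

        ddq_type = ddq->d_type & XFS_DQTYPE_REC_MASK;   /* 0x01 | 0x02 | 0x04 */
        if (ddq_type != XFS_DQTYPE_USER &&
            ddq_type != XFS_DQTYPE_PROJ &&
            ddq_type != XFS_DQTYPE_GROUP)
                return __this_address;                  /* exactly one type */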
+ * + * Currently, it is just one file system block, and a 4K blk contains 30 + * (136 * 30 = 4080) dquots. It's probably not worth trying to make + * this more dynamic. + * + * However, if this number is changed, we have to make sure that we don't + * implicitly assume that we do allocations in chunks of a single filesystem + * block in the dquot/xqm code. + * + * This is part of the ondisk format because the structure size is not a power + * of two, which leaves slack at the end of the disk block. + */ +#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 + +/* * Remote symlink format and access functions. */ #define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 7fcf62b324b0..f742a96a2fe1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -888,10 +888,9 @@ sparse_alloc: */ be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); - pag = xfs_perag_get(args.mp, agno); + pag = agbp->b_pag; pag->pagi_freecount += newlen; pag->pagi_count += newlen; - xfs_perag_put(pag); agi->agi_newino = cpu_to_be32(newino); /* @@ -1134,7 +1133,7 @@ xfs_dialloc_ag_inobt( xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); - struct xfs_perag *pag; + struct xfs_perag *pag = agbp->b_pag; struct xfs_btree_cur *cur, *tcur; struct xfs_inobt_rec_incore rec, trec; xfs_ino_t ino; @@ -1143,8 +1142,6 @@ xfs_dialloc_ag_inobt( int i, j; int searchdistance = 10; - pag = xfs_perag_get(mp, agno); - ASSERT(pag->pagi_init); ASSERT(pag->pagi_inodeok); ASSERT(pag->pagi_freecount > 0); @@ -1384,14 +1381,12 @@ alloc_inode: xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); - xfs_perag_put(pag); *inop = ino; return 0; error1: xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); error0: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_perag_put(pag); return error; } @@ -1587,7 +1582,6 @@ xfs_dialloc_ag( xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); - struct xfs_perag *pag; struct xfs_btree_cur *cur; /* finobt cursor */ struct xfs_btree_cur *icur; /* inobt cursor */ struct xfs_inobt_rec_incore rec; @@ -1599,8 +1593,6 @@ xfs_dialloc_ag( if (!xfs_sb_version_hasfinobt(&mp->m_sb)) return xfs_dialloc_ag_inobt(tp, agbp, parent, inop); - pag = xfs_perag_get(mp, agno); - /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. 
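The arithmetic in the cluster-size comment checks out as follows (a worked sketch; dquots_per_block() is a hypothetical helper for illustration only, the kernel's real computation lives in xfs_calc_dquots_per_chunk()):

        /*
         * sizeof(struct xfs_dqblk) is 136 bytes per the comment above, so a
         * 4096-byte block holds 4096 / 136 = 30 dquots, using 30 * 136 = 4080
         * bytes and leaving 16 bytes of slack -- which is why the constant is
         * considered part of the on-disk format.
         */
        static inline unsigned int dquots_per_block(unsigned int blocksize)
        {
                return blocksize / sizeof(struct xfs_dqblk);
        }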
@@ -1667,7 +1659,7 @@ xfs_dialloc_ag( */ be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - pag->pagi_freecount--; + agbp->b_pag->pagi_freecount--; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); @@ -1680,7 +1672,6 @@ xfs_dialloc_ag( xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - xfs_perag_put(pag); *inop = ino; return 0; @@ -1688,7 +1679,6 @@ error_icur: xfs_btree_del_cursor(icur, XFS_BTREE_ERROR); error_cur: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_perag_put(pag); return error; } @@ -1945,7 +1935,6 @@ xfs_difree_inobt( { struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); - struct xfs_perag *pag; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int ilen; @@ -2007,6 +1996,8 @@ xfs_difree_inobt( if (!(mp->m_flags & XFS_MOUNT_IKEEP) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { + struct xfs_perag *pag = agbp->b_pag; + xic->deleted = true; xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); @@ -2020,10 +2011,8 @@ xfs_difree_inobt( be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); - pag = xfs_perag_get(mp, agno); pag->pagi_freecount -= ilen - 1; pag->pagi_count -= ilen; - xfs_perag_put(pag); xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); @@ -2049,9 +2038,7 @@ xfs_difree_inobt( */ be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - pag = xfs_perag_get(mp, agno); - pag->pagi_freecount++; - xfs_perag_put(pag); + agbp->b_pag->pagi_freecount++; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } @@ -2661,7 +2648,7 @@ xfs_ialloc_read_agi( return error; agi = (*bpp)->b_addr; - pag = xfs_perag_get(mp, agno); + pag = (*bpp)->b_pag; if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); @@ -2674,7 +2661,6 @@ xfs_ialloc_read_agi( */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || XFS_FORCED_SHUTDOWN(mp)); - xfs_perag_put(pag); return 0; } diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index b2c122ad8f0e..3c8aebc36e64 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -411,7 +411,7 @@ xfs_inobt_init_common( { struct xfs_btree_cur *cur; - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = btnum; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 6f84ea85fdd8..8d5dd08eab75 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -21,30 +21,6 @@ #include <linux/iversion.h> /* - * Check that none of the inode's in the buffer have a next - * unlinked field of 0. 
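The per-AG hunks above all apply one transformation, isolated here for clarity (illustrative; mp, agno, agbp and newlen are stand-ins taken from the surrounding code). An AG header buffer holds a counted reference to its xfs_perag in b_pag for the buffer's whole life, so code that holds the buffer can borrow that reference instead of taking and dropping its own:

        /* before: transient reference around every access */
        pag = xfs_perag_get(mp, agno);
        pag->pagi_freecount += newlen;
        pag->pagi_count += newlen;
        xfs_perag_put(pag);

        /* after: borrow the reference cached in the buffer at lookup time */
        pag = agbp->b_pag;
        pag->pagi_freecount += newlen;
        pag->pagi_count += newlen;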
- */ -#if defined(DEBUG) -void -xfs_inobp_check( - xfs_mount_t *mp, - xfs_buf_t *bp) -{ - int i; - xfs_dinode_t *dip; - - for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { - dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize); - if (!dip->di_next_unlinked) { - xfs_alert(mp, - "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", - i, (long long)bp->b_bn); - } - } -} -#endif - -/* * If we are doing readahead on an inode buffer, we might be in log recovery * reading an inode allocation buffer that hasn't yet been replayed, and hence * has not had the inode cores stamped into it. Hence for readahead, the buffer @@ -53,10 +29,10 @@ xfs_inobp_check( * If the readahead buffer is invalid, we need to mark it with an error and * clear the DONE status of the buffer so that a followup read will re-read it * from disk. We don't report the error otherwise to avoid warnings during log - * recovery and we don't get unnecssary panics on debug kernels. We use EIO here + * recovery and we don't get unnecessary panics on debug kernels. We use EIO here * because all we want to do is say readahead failed; there is no-one to report * the error to, so this will distinguish it from a non-ra verifier failure. - * Changes to this readahead error behavour also need to be reflected in + * Changes to this readahead error behaviour also need to be reflected in * xfs_dquot_buf_readahead_verify(). */ static void @@ -176,7 +152,8 @@ xfs_imap_to_bp( } *bpp = bp; - *dipp = xfs_buf_offset(bp, imap->im_boffset); + if (dipp) + *dipp = xfs_buf_offset(bp, imap->im_boffset); return 0; } @@ -203,7 +180,7 @@ xfs_inode_from_disk( /* * First get the permanent information that is needed to allocate an * inode. If the inode is unused, mode is zero and we shouldn't mess - * with the unitialized part of it. + * with the uninitialized part of it. */ to->di_flushiter = be16_to_cpu(from->di_flushiter); inode->i_generation = be32_to_cpu(from->di_gen); diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 865ac493c72a..6b08b9d060c2 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -52,12 +52,6 @@ int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, struct xfs_dinode *to); -#if defined(DEBUG) -void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); -#else -#define xfs_inobp_check(mp, bp) -#endif /* DEBUG */ - xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip); xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 28b366275ae0..0cf853d42d62 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -291,7 +291,7 @@ xfs_iformat_attr_fork( * Initialize the extent count early, as the per-format routines may * depend on it. 
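The xfs_imap_to_bp() tweak above makes the dinode out-pointer optional, which a later hunk in this patch relies on when it needs only the inode cluster buffer. The call shape, sketched from that later hunk:

        struct xfs_buf  *bp;
        int             error;

        /* NULL dipp: grab the cluster buffer without mapping a dinode */
        error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, NULL, &bp, 0);
        if (error)
                return error;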
*/ - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); + ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL); ip->i_afp->if_format = dip->di_aformat; if (unlikely(ip->i_afp->if_format == 0)) /* pre IRIX 6.2 file system */ ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS; @@ -673,8 +673,8 @@ xfs_ifork_init_cow( if (ip->i_cowfp) return; - ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, - KM_NOFS); + ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_zone, + GFP_NOFS | __GFP_NOFAIL); ip->i_cowfp->if_flags = XFS_IFEXTENTS; ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS; } diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 56d9dd787e7b..076bdc7037ee 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -18,23 +18,22 @@ typedef uint64_t xfs_qcnt_t; typedef uint16_t xfs_qwarncnt_t; +typedef uint8_t xfs_dqtype_t; + +#define XFS_DQTYPE_STRINGS \ + { XFS_DQTYPE_USER, "USER" }, \ + { XFS_DQTYPE_PROJ, "PROJ" }, \ + { XFS_DQTYPE_GROUP, "GROUP" } + /* * flags for q_flags field in the dquot. */ -#define XFS_DQ_USER 0x0001 /* a user quota */ -#define XFS_DQ_PROJ 0x0002 /* project quota */ -#define XFS_DQ_GROUP 0x0004 /* a group quota */ -#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ -#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */ - -#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) +#define XFS_DQFLAG_DIRTY (1 << 0) /* dquot is dirty */ +#define XFS_DQFLAG_FREEING (1 << 1) /* dquot is being torn down */ -#define XFS_DQ_FLAGS \ - { XFS_DQ_USER, "USER" }, \ - { XFS_DQ_PROJ, "PROJ" }, \ - { XFS_DQ_GROUP, "GROUP" }, \ - { XFS_DQ_DIRTY, "DIRTY" }, \ - { XFS_DQ_FREEING, "FREEING" } +#define XFS_DQFLAG_STRINGS \ + { XFS_DQFLAG_DIRTY, "DIRTY" }, \ + { XFS_DQFLAG_FREEING, "FREEING" } /* * We have the possibility of all three quota types being active at once, and @@ -137,11 +136,11 @@ typedef uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, - struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type); + struct xfs_disk_dquot *ddq, xfs_dqid_t id); extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, - struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); + struct xfs_dqblk *dqb, xfs_dqid_t id); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, - xfs_dqid_t id, uint type); + xfs_dqid_t id, xfs_dqtype_t type); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 7fd6044a4f78..a6ac60ae9421 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -37,15 +37,13 @@ xfs_refcountbt_set_root( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); - struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + struct xfs_perag *pag = agbp->b_pag; ASSERT(ptr->s != 0); agf->agf_refcount_root = ptr->s; be32_add_cpu(&agf->agf_refcount_level, inc); pag->pagf_refcount_level += inc; - xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); @@ -325,7 +323,7 @@ xfs_refcountbt_init_common( ASSERT(agno != NULLAGNUMBER); ASSERT(agno < mp->m_sb.sb_agcount); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); cur->bc_tp = tp; cur->bc_mp = mp; 
cur->bc_btnum = XFS_BTNUM_REFC; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index b7c05314d07c..beb81c84a937 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -63,16 +63,14 @@ xfs_rmapbt_set_root( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); int btnum = cur->bc_btnum; - struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + struct xfs_perag *pag = agbp->b_pag; ASSERT(ptr->s != 0); agf->agf_roots[btnum] = ptr->s; be32_add_cpu(&agf->agf_levels[btnum], inc); pag->pagf_levels[btnum] += inc; - xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -123,6 +121,7 @@ xfs_rmapbt_free_block( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag; xfs_agblock_t bno; int error; @@ -139,8 +138,8 @@ xfs_rmapbt_free_block( XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); - xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_ag.agno); - + pag = cur->bc_ag.agbp->b_pag; + xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); return 0; } @@ -457,7 +456,7 @@ xfs_rmapbt_init_common( { struct xfs_btree_cur *cur; - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); cur->bc_tp = tp; cur->bc_mp = mp; /* Overlapping btree; 2 keys per pointer. */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 9498ced947be..1d9fa8a300f1 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -70,7 +70,7 @@ xfs_rtbuf_get( if (error) return error; - if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_real_extent(&map))) + if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) return -EFSCORRUPTED; ASSERT(map.br_startblock != NULLFSBLOCK); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index c45acbd3add9..708feb8eac76 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -65,6 +65,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ #define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ +#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */ /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index b5dfb6654842..e15129647e00 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -8,6 +8,8 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" @@ -36,6 +38,7 @@ xfs_trans_ijoin( ASSERT(iip->ili_lock_flags == 0); iip->ili_lock_flags = lock_flags; + ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); /* * Get a log_item_desc to point at the new item. @@ -71,24 +74,35 @@ xfs_trans_ichgtime( } /* - * This is called to mark the fields indicated in fieldmask as needing - * to be logged when the transaction is committed. The inode must - * already be associated with the given transaction. 
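For context on the XFS_TRANS_RES_FDBLKS flag just added to xfs_shared.h: it is consumed at transaction allocation time, and the swapext hunk later in this patch uses it roughly as below (a sketch; resblks and the rmapbt feature test are taken from that hunk):

        unsigned int    flags = 0;

        /* let extents freed by this transaction refill its reservation */
        if (xfs_sb_version_hasrmapbt(&mp->m_sb))
                flags |= XFS_TRANS_RES_FDBLKS;

        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags,
                        &tp);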
+ * This is called to mark the fields indicated in fieldmask as needing to be + * logged when the transaction is committed. The inode must already be + * associated with the given transaction. * - * The values for fieldmask are defined in xfs_inode_item.h. We always - * log all of the core inode if any of it has changed, and we always log - * all of the inline data/extents/b-tree root if any of them has changed. + * The values for fieldmask are defined in xfs_inode_item.h. We always log all + * of the core inode if any of it has changed, and we always log all of the + * inline data/extents/b-tree root if any of them has changed. + * + * Grab and pin the cluster buffer associated with this inode to avoid RMW + * cycles at inode writeback time. Avoid the need to add error handling to every + * xfs_trans_log_inode() call by shutting down on read error. This will cause + * transactions to fail and everything to error out, just like if we return a + * read error in a dirty transaction and cancel it. */ void xfs_trans_log_inode( - xfs_trans_t *tp, - xfs_inode_t *ip, - uint flags) + struct xfs_trans *tp, + struct xfs_inode *ip, + uint flags) { - struct inode *inode = VFS_I(ip); + struct xfs_inode_log_item *iip = ip->i_itemp; + struct inode *inode = VFS_I(ip); + uint iversion_flags = 0; - ASSERT(ip->i_itemp != NULL); + ASSERT(iip); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); + + tp->t_flags |= XFS_TRANS_DIRTY; /* * Don't bother with i_lock for the I_DIRTY_TIME check here, as races @@ -103,15 +117,6 @@ xfs_trans_log_inode( } /* - * Record the specific change for fdatasync optimisation. This - * allows fdatasync to skip log forces for inodes that are only - * timestamp dirty. We do this before the change count so that - * the core being logged in this case does not impact on fdatasync - * behaviour. - */ - ip->i_itemp->ili_fsync_fields |= flags; - - /* * First time we log the inode in a transaction, bump the inode change * counter if it is configured for this to occur. While we have the * inode locked exclusively for metadata modification, we can usually @@ -120,23 +125,64 @@ xfs_trans_log_inode( * set however, then go ahead and bump the i_version counter * unconditionally. */ - if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) && - IS_I_VERSION(VFS_I(ip))) { - if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE)) - flags |= XFS_ILOG_CORE; + if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) { + if (IS_I_VERSION(inode) && + inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE)) + iversion_flags = XFS_ILOG_CORE; } - tp->t_flags |= XFS_TRANS_DIRTY; + /* + * Record the specific change for fdatasync optimisation. This allows + * fdatasync to skip log forces for inodes that are only timestamp + * dirty. + */ + spin_lock(&iip->ili_lock); + iip->ili_fsync_fields |= flags; + + if (!iip->ili_item.li_buf) { + struct xfs_buf *bp; + int error; + + /* + * We hold the ILOCK here, so this inode is not going to be + * flushed while we are here. Further, because there is no + * buffer attached to the item, we know that there is no IO in + * progress, so nothing will clear the ili_fields while we read + * in the buffer. Hence we can safely drop the spin lock and + * read the buffer knowing that the state will not change from + * here. 
+ */ + spin_unlock(&iip->ili_lock); + error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, NULL, + &bp, 0); + if (error) { + xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR); + return; + } + + /* + * We need an explicit buffer reference for the log item but + * don't want the buffer to remain attached to the transaction. + * Hold the buffer but release the transaction reference once + * we've attached the inode log item to the buffer log item + * list. + */ + xfs_buf_hold(bp); + spin_lock(&iip->ili_lock); + iip->ili_item.li_buf = bp; + bp->b_flags |= _XBF_INODES; + list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); + xfs_trans_brelse(tp, bp); + } /* - * Always OR in the bits from the ili_last_fields field. - * This is to coordinate with the xfs_iflush() and xfs_iflush_done() - * routines in the eventual clearing of the ili_fields bits. - * See the big comment in xfs_iflush() for an explanation of - * this coordination mechanism. + * Always OR in the bits from the ili_last_fields field. This is to + * coordinate with the xfs_iflush() and xfs_iflush_done() routines in + * the eventual clearing of the ili_fields bits. See the big comment in + * xfs_iflush() for an explanation of this coordination mechanism. */ - flags |= ip->i_itemp->ili_last_fields; - ip->i_itemp->ili_fields |= flags; + iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags); + spin_unlock(&iip->ili_lock); } int diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 88221c7a04cc..c6df01a2a158 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -57,7 +57,7 @@ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ (M_IGEO(mp)->ialloc_blks + \ - (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \ + ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \ (M_IGEO(mp)->inobt_maxlevels - 1))) /* diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 7badd6dfe544..955302e7cdde 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -45,9 +45,27 @@ xchk_setup_inode_bmap( */ if (S_ISREG(VFS_I(sc->ip)->i_mode) && sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) { + struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + inode_dio_wait(VFS_I(sc->ip)); - error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping); - if (error) + + /* + * Try to flush all incore state to disk before we examine the + * space mappings for the data fork. Leave accumulated errors + * in the mapping for the writer threads to consume. + * + * On ENOSPC or EIO writeback errors, we continue into the + * extent mapping checks because write failures do not + * necessarily imply anything about the correctness of the file + * metadata. The metadata and the file data could be on + * completely separate devices; a media failure might only + * affect a subset of the disk, etc. We can handle delalloc + * extents in the scrubber, so leaving them in memory is fine. 
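The buffer-pinning sequence in the xfs_trans_log_inode() hunk above is worth annotating, since the reference changes hands twice (same code as the hunk, with comments added for clarity):

        xfs_buf_hold(bp);               /* take a reference for the log item */
        spin_lock(&iip->ili_lock);
        iip->ili_item.li_buf = bp;      /* pin the cluster buffer to the item */
        bp->b_flags |= _XBF_INODES;     /* route completion to the inode handler */
        list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
        xfs_trans_brelse(tp, bp);       /* drop the transaction's reference */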
+ */ + error = filemap_fdatawrite(mapping); + if (!error) + error = filemap_fdatawait_keep_errors(mapping); + if (error && (error != -ENOSPC && error != -EIO)) goto out; } diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 44b15015021f..e56786f0a13c 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -476,9 +476,7 @@ xchk_da_btree( ds.dargs.whichfork = whichfork; ds.dargs.trans = sc->tp; ds.dargs.op_flags = XFS_DA_OP_OKNOENT; - ds.state = xfs_da_state_alloc(); - ds.state->args = &ds.dargs; - ds.state->mp = mp; + ds.state = xfs_da_state_alloc(&ds.dargs); ds.sc = sc; ds.private = private; if (whichfork == XFS_ATTR_FORK) { diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 905a34558361..e34ca20ae8e4 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -18,17 +18,17 @@ #include "scrub/common.h" /* Convert a scrub type code to a DQ flag, or return 0 if error. */ -static inline uint +static inline xfs_dqtype_t xchk_quota_to_dqtype( struct xfs_scrub *sc) { switch (sc->sm->sm_type) { case XFS_SCRUB_TYPE_UQUOTA: - return XFS_DQ_USER; + return XFS_DQTYPE_USER; case XFS_SCRUB_TYPE_GQUOTA: - return XFS_DQ_GROUP; + return XFS_DQTYPE_GROUP; case XFS_SCRUB_TYPE_PQUOTA: - return XFS_DQ_PROJ; + return XFS_DQTYPE_PROJ; default: return 0; } @@ -40,7 +40,7 @@ xchk_setup_quota( struct xfs_scrub *sc, struct xfs_inode *ip) { - uint dqtype; + xfs_dqtype_t dqtype; int error; if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp)) @@ -73,26 +73,15 @@ struct xchk_quota_info { STATIC int xchk_quota_item( struct xfs_dquot *dq, - uint dqtype, + xfs_dqtype_t dqtype, void *priv) { struct xchk_quota_info *sqi = priv; struct xfs_scrub *sc = sqi->sc; struct xfs_mount *mp = sc->mp; - struct xfs_disk_dquot *d = &dq->q_core; struct xfs_quotainfo *qi = mp->m_quotainfo; xfs_fileoff_t offset; - unsigned long long bsoft; - unsigned long long isoft; - unsigned long long rsoft; - unsigned long long bhard; - unsigned long long ihard; - unsigned long long rhard; - unsigned long long bcount; - unsigned long long icount; - unsigned long long rcount; xfs_ino_t fs_icount; - xfs_dqid_t id = be32_to_cpu(d->d_id); int error = 0; if (xchk_should_terminate(sc, &error)) @@ -102,27 +91,11 @@ xchk_quota_item( * Except for the root dquot, the actual dquot we got must either have * the same or higher id as we saw before. */ - offset = id / qi->qi_dqperchunk; - if (id && id <= sqi->last_id) + offset = dq->q_id / qi->qi_dqperchunk; + if (dq->q_id && dq->q_id <= sqi->last_id) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - sqi->last_id = id; - - /* Did we get the dquot type we wanted? */ - if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES)) - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - - if (d->d_pad0 != cpu_to_be32(0) || d->d_pad != cpu_to_be16(0)) - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - - /* Check the limits. */ - bhard = be64_to_cpu(d->d_blk_hardlimit); - ihard = be64_to_cpu(d->d_ino_hardlimit); - rhard = be64_to_cpu(d->d_rtb_hardlimit); - - bsoft = be64_to_cpu(d->d_blk_softlimit); - isoft = be64_to_cpu(d->d_ino_softlimit); - rsoft = be64_to_cpu(d->d_rtb_softlimit); + sqi->last_id = dq->q_id; /* * Warn if the hard limits are larger than the fs. @@ -132,25 +105,22 @@ xchk_quota_item( * Complain about corruption if the soft limit is greater than * the hard limit. 
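This scrub rewrite depends on the new incore dquot layout, where each quota resource is a native-endian struct rather than raw big-endian q_core fields. An abridged sketch of the assumed form (the real structure also carries reserved space, timers and warning counts):

        struct xfs_dquot_res {
                xfs_qcnt_t      count;          /* resources used */
                xfs_qcnt_t      hardlimit;      /* absolute limit */
                xfs_qcnt_t      softlimit;      /* preferred limit */
        };

        /* limits compare directly; no be64_to_cpu() unpacking required */
        if (dq->q_blk.softlimit > dq->q_blk.hardlimit)
                xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);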
*/ - if (bhard > mp->m_sb.sb_dblocks) + if (dq->q_blk.hardlimit > mp->m_sb.sb_dblocks) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (bsoft > bhard) + if (dq->q_blk.softlimit > dq->q_blk.hardlimit) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - if (ihard > M_IGEO(mp)->maxicount) + if (dq->q_ino.hardlimit > M_IGEO(mp)->maxicount) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (isoft > ihard) + if (dq->q_ino.softlimit > dq->q_ino.hardlimit) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - if (rhard > mp->m_sb.sb_rblocks) + if (dq->q_rtb.hardlimit > mp->m_sb.sb_rblocks) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (rsoft > rhard) + if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); /* Check the resource counts. */ - bcount = be64_to_cpu(d->d_bcount); - icount = be64_to_cpu(d->d_icount); - rcount = be64_to_cpu(d->d_rtbcount); fs_icount = percpu_counter_sum(&mp->m_icount); /* @@ -159,15 +129,15 @@ xchk_quota_item( * if there are no quota limits. */ if (xfs_sb_version_hasreflink(&mp->m_sb)) { - if (mp->m_sb.sb_dblocks < bcount) + if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); } else { - if (mp->m_sb.sb_dblocks < bcount) + if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); } - if (icount > fs_icount || rcount > mp->m_sb.sb_rblocks) + if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); /* @@ -175,13 +145,22 @@ xchk_quota_item( * lower limit than the actual usage. However, we flag it for * admin review. */ - if (id != 0 && bhard != 0 && bcount > bhard) + if (dq->q_id == 0) + goto out; + + if (dq->q_blk.hardlimit != 0 && + dq->q_blk.count > dq->q_blk.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (id != 0 && ihard != 0 && icount > ihard) + + if (dq->q_ino.hardlimit != 0 && + dq->q_ino.count > dq->q_ino.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (id != 0 && rhard != 0 && rcount > rhard) + + if (dq->q_rtb.hardlimit != 0 && + dq->q_rtb.count > dq->q_rtb.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); +out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return -EFSCORRUPTED; @@ -235,7 +214,7 @@ xchk_quota( struct xchk_quota_info sqi; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; - uint dqtype; + xfs_dqtype_t dqtype; int error = 0; dqtype = xchk_quota_to_dqtype(sc); diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index db3cfd12803d..25e86c71e7b9 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -899,11 +899,11 @@ xrep_find_ag_btree_roots( void xrep_force_quotacheck( struct xfs_scrub *sc, - uint dqtype) + xfs_dqtype_t type) { uint flag; - flag = xfs_quota_chkd_flag(dqtype); + flag = xfs_quota_chkd_flag(type); if (!(flag & sc->mp->m_qflags)) return; @@ -939,11 +939,11 @@ xrep_ino_dqattach( "inode %llu repair encountered quota error %d, quotacheck forced.", (unsigned long long)sc->ip->i_ino, error); if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot) - xrep_force_quotacheck(sc, XFS_DQ_USER); + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot) - xrep_force_quotacheck(sc, XFS_DQ_GROUP); + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot) - xrep_force_quotacheck(sc, XFS_DQ_PROJ); + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); /* fall through */ case 
-ESRCH: error = 0; diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 04a47d45605b..fe77de01abe0 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -6,6 +6,8 @@ #ifndef __XFS_SCRUB_REPAIR_H__ #define __XFS_SCRUB_REPAIR_H__ +#include "xfs_quota_defs.h" + static inline int xrep_notsupported(struct xfs_scrub *sc) { return -EOPNOTSUPP; @@ -49,7 +51,7 @@ struct xrep_find_ag_btree { int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp); -void xrep_force_quotacheck(struct xfs_scrub *sc, uint dqtype); +void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); /* Metadata repairers */ diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index c642bc206c41..76e4ffe0315b 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -13,6 +13,7 @@ #include "xfs_trans.h" #include "xfs_rtalloc.h" #include "xfs_inode.h" +#include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -58,6 +59,41 @@ xchk_rtbitmap_rec( return 0; } +/* Make sure the entire rtbitmap file is mapped with written extents. */ +STATIC int +xchk_rtbitmap_check_extents( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_bmbt_irec map; + xfs_rtblock_t off; + int nmap; + int error = 0; + + for (off = 0; off < mp->m_sb.sb_rbmblocks;) { + if (xchk_should_terminate(sc, &error) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + break; + + /* Make sure we have a written extent. */ + nmap = 1; + error = xfs_bmapi_read(mp->m_rbmip, off, + mp->m_sb.sb_rbmblocks - off, &map, &nmap, + XFS_DATA_FORK); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) + break; + + if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + break; + } + + off += map.br_blockcount; + } + + return error; +} + /* Scrub the realtime bitmap. */ int xchk_rtbitmap( @@ -65,11 +101,22 @@ xchk_rtbitmap( { int error; + /* Is the size of the rtbitmap correct? */ + if (sc->mp->m_rbmip->i_d.di_size != + XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) { + xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino); + return 0; + } + /* Invoke the fork scrubber. 
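xchk_rtbitmap_check_extents() above is the scrub-side twin of the xfs_rtbuf_get() change earlier in this patch: both reject any non-written mapping in the bitmap file, because the bitmap is read and modified in place and an unwritten extent would read back as zeroes instead of real allocation state. The core of the check, condensed:

        nmap = 1;
        error = xfs_bmapi_read(mp->m_rbmip, off, mp->m_sb.sb_rbmblocks - off,
                        &map, &nmap, XFS_DATA_FORK);
        if (!error && (nmap != 1 || !xfs_bmap_is_written_extent(&map)))
                xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);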
*/ error = xchk_metadata_inode_forks(sc); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; + error = xchk_rtbitmap_check_extents(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 6736c5ab188f..ec3691372e7c 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -138,7 +138,7 @@ xfs_bui_init( { struct xfs_bui_log_item *buip; - buip = kmem_zone_zalloc(xfs_bui_zone, 0); + buip = kmem_cache_zalloc(xfs_bui_zone, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; @@ -215,7 +215,7 @@ xfs_trans_get_bud( { struct xfs_bud_log_item *budp; - budp = kmem_zone_zalloc(xfs_bud_zone, 0); + budp = kmem_cache_zalloc(xfs_bud_zone, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops); budp->bud_buip = buip; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 30525861c596..73cafc843cd7 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1567,6 +1567,7 @@ xfs_swap_extents( int lock_flags; uint64_t f; int resblks = 0; + unsigned int flags = 0; /* * Lock the inodes against other IO, page faults and truncate to @@ -1630,17 +1631,16 @@ xfs_swap_extents( resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w); /* - * Handle the corner case where either inode might straddle the - * btree format boundary. If so, the inode could bounce between - * btree <-> extent format on unmap -> remap cycles, freeing and - * allocating a bmapbt block each time. + * If either inode straddles a bmapbt block allocation boundary, + * the rmapbt algorithm triggers repeated allocs and frees as + * extents are remapped. This can exhaust the block reservation + * prematurely and cause shutdown. Return freed blocks to the + * transaction reservation to counter this behavior. 
*/ - if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1)) - resblks += XFS_IFORK_MAXEXT(ip, w); - if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1)) - resblks += XFS_IFORK_MAXEXT(tip, w); + flags |= XFS_TRANS_RES_FDBLKS; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags, + &tp); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 20b748f7e186..d4cdcb6fb2fe 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -14,6 +14,9 @@ #include "xfs_mount.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_log_recover.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" #include "xfs_errortag.h" #include "xfs_error.h" @@ -211,9 +214,7 @@ _xfs_buf_alloc( int i; *bpp = NULL; - bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); - if (unlikely(!bp)) - return -ENOMEM; + bp = kmem_cache_zalloc(xfs_buf_zone, GFP_NOFS | __GFP_NOFAIL); /* * We don't want certain flags to appear in b_flags unless they are @@ -655,7 +656,6 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - ASSERT(bp->b_iodone == NULL); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; bp->b_ops = NULL; } @@ -1191,10 +1191,13 @@ xfs_buf_ioend( if (!bp->b_error && bp->b_io_error) xfs_buf_ioerror(bp, bp->b_io_error); - /* Only validate buffers that were read without errors */ - if (read && !bp->b_error && bp->b_ops) { - ASSERT(!bp->b_iodone); - bp->b_ops->verify_read(bp); + if (read) { + if (!bp->b_error && bp->b_ops) + bp->b_ops->verify_read(bp); + if (!bp->b_error) + bp->b_flags |= XBF_DONE; + xfs_buf_ioend_finish(bp); + return; } if (!bp->b_error) { @@ -1202,12 +1205,25 @@ xfs_buf_ioend( bp->b_flags |= XBF_DONE; } - if (bp->b_iodone) - (*(bp->b_iodone))(bp); - else if (bp->b_flags & XBF_ASYNC) - xfs_buf_relse(bp); - else - complete(&bp->b_iowait); + /* + * If this is a log recovery buffer, we aren't doing transactional IO + * yet so we need to let it handle IO completions. 
+ */ + if (bp->b_flags & _XBF_LOGRECOVERY) { + xlog_recover_iodone(bp); + return; + } + + if (bp->b_flags & _XBF_INODES) { + xfs_buf_inode_iodone(bp); + return; + } + + if (bp->b_flags & _XBF_DQUOTS) { + xfs_buf_dquot_iodone(bp); + return; + } + xfs_buf_iodone(bp); } static void diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 050c53b739e2..755b652e695a 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -18,6 +18,7 @@ /* * Base types */ +struct xfs_buf; #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) @@ -30,15 +31,20 @@ #define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ #define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */ -/* flags used only as arguments to access routines */ -#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ +/* buffer type flags for write callbacks */ +#define _XBF_INODES (1 << 16)/* inode buffer */ +#define _XBF_DQUOTS (1 << 17)/* dquot buffer */ +#define _XBF_LOGRECOVERY (1 << 18)/* log recovery buffer */ /* flags used only internally */ #define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +/* flags used only as arguments to access routines */ +#define XBF_TRYLOCK (1 << 30)/* lock requested, but do not wait */ +#define XBF_UNMAPPED (1 << 31)/* do not map the buffer */ + typedef unsigned int xfs_buf_flags_t; #define XFS_BUF_FLAGS \ @@ -50,12 +56,15 @@ typedef unsigned int xfs_buf_flags_t; { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ - { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ - { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ + { _XBF_INODES, "INODES" }, \ + { _XBF_DQUOTS, "DQUOTS" }, \ + { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ - { _XBF_DELWRI_Q, "DELWRI_Q" } - + { _XBF_DELWRI_Q, "DELWRI_Q" }, \ + /* The following interface flags should never be set */ \ + { XBF_TRYLOCK, "TRYLOCK" }, \ + { XBF_UNMAPPED, "UNMAPPED" } /* * Internal state flags. 
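Summarising the b_flags bit space after the renumbering above (informational only, derived from the defines in this hunk):

        /*
         * bits  0..7    I/O and state flags (XBF_READ ... XBF_WRITE_FAIL)
         * bits 16..18   buffer type flags for write completions
         *               (_XBF_INODES, _XBF_DQUOTS, _XBF_LOGRECOVERY)
         * bits 20..22   internal flags (_XBF_PAGES, _XBF_KMEM, _XBF_DELWRI_Q)
         * bits 30..31   lookup-interface-only flags (XBF_TRYLOCK,
         *               XBF_UNMAPPED), moved up so they can never collide
         *               with the state flags
         */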
@@ -94,10 +103,6 @@ typedef struct xfs_buftarg { struct ratelimit_state bt_ioerror_rl; } xfs_buftarg_t; -struct xfs_buf; -typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); - - #define XB_PAGES 2 struct xfs_buf_map { @@ -150,7 +155,6 @@ typedef struct xfs_buf { xfs_buftarg_t *b_target; /* buffer target (device) */ void *b_addr; /* virtual address of buffer */ struct work_struct b_ioend_work; - xfs_buf_iodone_t b_iodone; /* I/O completion function */ struct completion b_iowait; /* queue for I/O waiters */ struct xfs_buf_log_item *b_log_item; struct list_head b_li_list; /* Log items list head */ @@ -257,9 +261,23 @@ extern void xfs_buf_unlock(xfs_buf_t *); #define xfs_buf_islocked(bp) \ ((bp)->b_sema.count <= 0) +static inline void xfs_buf_relse(xfs_buf_t *bp) +{ + xfs_buf_unlock(bp); + xfs_buf_rele(bp); +} + /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); extern void xfs_buf_ioend(struct xfs_buf *bp); +static inline void xfs_buf_ioend_finish(struct xfs_buf *bp) +{ + if (bp->b_flags & XBF_ASYNC) + xfs_buf_relse(bp); + else + complete(&bp->b_iowait); +} + extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) @@ -324,12 +342,6 @@ static inline int xfs_buf_ispinned(struct xfs_buf *bp) return atomic_read(&bp->b_pin_count); } -static inline void xfs_buf_relse(xfs_buf_t *bp) -{ - xfs_buf_unlock(bp); - xfs_buf_rele(bp); -} - static inline int xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset) { diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 9e75e8d6042e..5bb6f22cc11a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -12,8 +12,13 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_trans.h" -#include "xfs_buf_item.h" #include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_quota.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" #include "xfs_trace.h" #include "xfs_log.h" @@ -25,7 +30,7 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } -STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); +static void xfs_buf_item_done(struct xfs_buf *bp); /* Is this log iovec plausibly large enough to contain the buffer log format? */ bool @@ -457,10 +462,9 @@ xfs_buf_item_unpin( * the AIL lock. */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { - xfs_buf_do_callbacks(bp); - bp->b_log_item = NULL; - list_del_init(&bp->b_li_list); - bp->b_iodone = NULL; + xfs_buf_item_done(bp); + xfs_iflush_done(bp); + ASSERT(list_empty(&bp->b_li_list)); } else { xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); @@ -734,7 +738,7 @@ xfs_buf_item_init( return 0; } - bip = kmem_zone_zalloc(xfs_buf_item_zone, 0); + bip = kmem_cache_zalloc(xfs_buf_item_zone, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; @@ -936,11 +940,7 @@ xfs_buf_item_free( } /* - * This is called when the buf log item is no longer needed. It should - * free the buf log item associated with the given buffer and clear - * the buffer's pointer to the buf log item. If there are no more - * items in the list, clear the b_iodone field of the buffer (see - * xfs_buf_attach_iodone() below). + * xfs_buf_item_relse() is called when the buf log item is no longer needed. 
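A structural note on the header hunk above: xfs_buf_relse() moves earlier in the file, presumably because the new inline xfs_buf_ioend_finish() calls it and an inline callee must be defined before its inline caller. The helper itself is the common tail of every typed completion handler added below:

        static inline void xfs_buf_ioend_finish(struct xfs_buf *bp)
        {
                if (bp->b_flags & XBF_ASYNC)
                        xfs_buf_relse(bp);       /* unlock, drop I/O reference */
                else
                        complete(&bp->b_iowait); /* wake the synchronous waiter */
        }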
*/ void xfs_buf_item_relse( @@ -952,137 +952,28 @@ xfs_buf_item_relse( ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); bp->b_log_item = NULL; - if (list_empty(&bp->b_li_list)) - bp->b_iodone = NULL; - xfs_buf_rele(bp); xfs_buf_item_free(bip); } - /* - * Add the given log item with its callback to the list of callbacks - * to be called when the buffer's I/O completes. If it is not set - * already, set the buffer's b_iodone() routine to be - * xfs_buf_iodone_callbacks() and link the log item into the list of - * items rooted at b_li_list. + * Decide if we're going to retry the write after a failure, and prepare + * the buffer for retrying the write. */ -void -xfs_buf_attach_iodone( - struct xfs_buf *bp, - void (*cb)(struct xfs_buf *, struct xfs_log_item *), - struct xfs_log_item *lip) -{ - ASSERT(xfs_buf_islocked(bp)); - - lip->li_cb = cb; - list_add_tail(&lip->li_bio_list, &bp->b_li_list); - - ASSERT(bp->b_iodone == NULL || - bp->b_iodone == xfs_buf_iodone_callbacks); - bp->b_iodone = xfs_buf_iodone_callbacks; -} - -/* - * We can have many callbacks on a buffer. Running the callbacks individually - * can cause a lot of contention on the AIL lock, so we allow for a single - * callback to be able to scan the remaining items in bp->b_li_list for other - * items of the same type and callback to be processed in the first call. - * - * As a result, the loop walking the callback list below will also modify the - * list. it removes the first item from the list and then runs the callback. - * The loop then restarts from the new first item int the list. This allows the - * callback to scan and modify the list attached to the buffer and we don't - * have to care about maintaining a next item pointer. - */ -STATIC void -xfs_buf_do_callbacks( - struct xfs_buf *bp) -{ - struct xfs_buf_log_item *blip = bp->b_log_item; - struct xfs_log_item *lip; - - /* If there is a buf_log_item attached, run its callback */ - if (blip) { - lip = &blip->bli_item; - lip->li_cb(bp, lip); - } - - while (!list_empty(&bp->b_li_list)) { - lip = list_first_entry(&bp->b_li_list, struct xfs_log_item, - li_bio_list); - - /* - * Remove the item from the list, so we don't have any - * confusion if the item is added to another buf. - * Don't touch the log item after calling its - * callback, because it could have freed itself. - */ - list_del_init(&lip->li_bio_list); - lip->li_cb(bp, lip); - } -} - -/* - * Invoke the error state callback for each log item affected by the failed I/O. - * - * If a metadata buffer write fails with a non-permanent error, the buffer is - * eventually resubmitted and so the completion callbacks are not run. The error - * state may need to be propagated to the log items attached to the buffer, - * however, so the next AIL push of the item knows hot to handle it correctly. - */ -STATIC void -xfs_buf_do_callbacks_fail( - struct xfs_buf *bp) -{ - struct xfs_log_item *lip; - struct xfs_ail *ailp; - - /* - * Buffer log item errors are handled directly by xfs_buf_item_push() - * and xfs_buf_iodone_callback_error, and they have no IO error - * callbacks. Check only for items in b_li_list. 
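With xfs_buf_attach_iodone() and the callback walker removed, completion routing becomes static: the buffer type flag picks a handler and each handler processes b_li_list itself. The success path of the inode variant, condensed from the replacement code later in this patch (error handling elided):

        void
        xfs_buf_inode_iodone(struct xfs_buf *bp)
        {
                xfs_buf_clear_ioerror_retry_state(bp);
                xfs_buf_item_done(bp);          /* retire an attached buf log item */
                xfs_iflush_done(bp);            /* complete inode items on b_li_list */
                xfs_buf_ioend_finish(bp);       /* release buffer or wake waiters */
        }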
- */ - if (list_empty(&bp->b_li_list)) - return; - - lip = list_first_entry(&bp->b_li_list, struct xfs_log_item, - li_bio_list); - ailp = lip->li_ailp; - spin_lock(&ailp->ail_lock); - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - if (lip->li_ops->iop_error) - lip->li_ops->iop_error(lip, bp); - } - spin_unlock(&ailp->ail_lock); -} - static bool -xfs_buf_iodone_callback_error( +xfs_buf_ioerror_fail_without_retry( struct xfs_buf *bp) { - struct xfs_buf_log_item *bip = bp->b_log_item; - struct xfs_log_item *lip; - struct xfs_mount *mp; + struct xfs_mount *mp = bp->b_mount; static ulong lasttime; static xfs_buftarg_t *lasttarg; - struct xfs_error_cfg *cfg; - - /* - * The failed buffer might not have a buf_log_item attached or the - * log_item list might be empty. Get the mp from the available - * xfs_log_item - */ - lip = list_first_entry_or_null(&bp->b_li_list, struct xfs_log_item, - li_bio_list); - mp = lip ? lip->li_mountp : bip->bli_item.li_mountp; /* * If we've already decided to shutdown the filesystem because of * I/O errors, there's no point in giving this a retry. */ if (XFS_FORCED_SHUTDOWN(mp)) - goto out_stale; + return true; if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { @@ -1093,129 +984,240 @@ xfs_buf_iodone_callback_error( /* synchronous writes will have callers process the error */ if (!(bp->b_flags & XBF_ASYNC)) - goto out_stale; - - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - ASSERT(bp->b_iodone != NULL); - - cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); + return true; + return false; +} - /* - * If the write was asynchronous then no one will be looking for the - * error. If this is the first failure of this type, clear the error - * state and write the buffer out again. This means we always retry an - * async write failure at least once, but we also need to set the buffer - * up to behave correctly now for repeated failures. - */ - if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) || - bp->b_last_error != bp->b_error) { - bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); - bp->b_last_error = bp->b_error; - if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && - !bp->b_first_retry_time) - bp->b_first_retry_time = jiffies; +static bool +xfs_buf_ioerror_retry( + struct xfs_buf *bp, + struct xfs_error_cfg *cfg) +{ + if ((bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) && + bp->b_last_error == bp->b_error) + return false; - xfs_buf_ioerror(bp, 0); - xfs_buf_submit(bp); - return true; - } + bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); + bp->b_last_error = bp->b_error; + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + !bp->b_first_retry_time) + bp->b_first_retry_time = jiffies; + return true; +} - /* - * Repeated failure on an async write. Take action according to the - * error configuration we have been set up to use. - */ +/* + * Account for this latest trip around the retry handler, and decide if + * we've failed enough times to constitute a permanent failure. 
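Putting the refactored helpers together, the write-error decision flow reads as follows (a summary of the code above and below, not new behaviour):

        /*
         * shutdown in progress, or a sync write   -> XBF_IOERROR_FINISH
         *                                            (stale the buffer, done)
         * first failure of this error type        -> resubmit the write,
         *                                            XBF_IOERROR_DONE
         * retry budget and timeout not exhausted  -> XBF_IOERROR_FAIL (mark
         *                                            items failed, AIL retries)
         * budget/timeout exhausted, or unmounting -> force shutdown,
         *                                            XBF_IOERROR_FINISH
         */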
+ */ +static bool +xfs_buf_ioerror_permanent( + struct xfs_buf *bp, + struct xfs_error_cfg *cfg) +{ + struct xfs_mount *mp = bp->b_mount; if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && ++bp->b_retries > cfg->max_retries) - goto permanent_error; + return true; if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) - goto permanent_error; + return true; /* At unmount we may treat errors differently */ if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) - goto permanent_error; + return true; - /* - * Still a transient error, run IO completion failure callbacks and let - * the higher layers retry the buffer. - */ - xfs_buf_do_callbacks_fail(bp); - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return true; + return false; +} + +/* + * On a sync write or shutdown we just want to stale the buffer and let the + * caller handle the error in bp->b_error appropriately. + * + * If the write was asynchronous then no one will be looking for the error. If + * this is the first failure of this type, clear the error state and write the + * buffer out again. This means we always retry an async write failure at least + * once, but we also need to set the buffer up to behave correctly now for + * repeated failures. + * + * If we get repeated async write failures, then we take action according to the + * error configuration we have been set up to use. + * + * Multi-state return value: + * + * XBF_IOERROR_FINISH: clear IO error retry state and run callback completions + * XBF_IOERROR_DONE: resubmitted immediately, do not run any completions + * XBF_IOERROR_FAIL: transient error, run failure callback completions and then + * release the buffer + */ +enum { + XBF_IOERROR_FINISH, + XBF_IOERROR_DONE, + XBF_IOERROR_FAIL, +}; + +static int +xfs_buf_iodone_error( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_error_cfg *cfg; + + if (xfs_buf_ioerror_fail_without_retry(bp)) + goto out_stale; + + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + + cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); + if (xfs_buf_ioerror_retry(bp, cfg)) { + xfs_buf_ioerror(bp, 0); + xfs_buf_submit(bp); + return XBF_IOERROR_DONE; + } /* * Permanent error - we need to trigger a shutdown if we haven't already * to indicate that inconsistency will result from this action. */ -permanent_error: - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_buf_ioerror_permanent(bp, cfg)) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + goto out_stale; + } + + /* Still considered a transient error. Caller will schedule retries. */ + return XBF_IOERROR_FAIL; + out_stale: xfs_buf_stale(bp); bp->b_flags |= XBF_DONE; trace_xfs_buf_error_relse(bp, _RET_IP_); - return false; + return XBF_IOERROR_FINISH; } -/* - * This is the iodone() function for buffers which have had callbacks attached - * to them by xfs_buf_attach_iodone(). We need to iterate the items on the - * callback list, mark the buffer as having no more callbacks and then push the - * buffer through IO completion processing. - */ -void -xfs_buf_iodone_callbacks( +static void +xfs_buf_item_done( struct xfs_buf *bp) { - /* - * If there is an error, process it. Some errors require us - * to run callbacks after failure processing is done so we - * detect that and take appropriate action. - */ - if (bp->b_error && xfs_buf_iodone_callback_error(bp)) + struct xfs_buf_log_item *bip = bp->b_log_item; + + if (!bip) return; /* - * Successful IO or permanent error. 
Either way, we can clear the - * retry state here in preparation for the next error that may occur. + * If we are forcibly shutting down, this may well be off the AIL + * already. That's because we simulate the log-committed callbacks to + * unpin these buffers. Or we may never have put this item on AIL + * because the transaction was aborted forcibly. + * xfs_trans_ail_delete() takes care of these. + * + * Either way, AIL is useless if we're forcing a shutdown. */ + xfs_trans_ail_delete(&bip->bli_item, SHUTDOWN_CORRUPT_INCORE); + bp->b_log_item = NULL; + xfs_buf_item_free(bip); + xfs_buf_rele(bp); +} + +static inline void +xfs_buf_clear_ioerror_retry_state( + struct xfs_buf *bp) +{ bp->b_last_error = 0; bp->b_retries = 0; bp->b_first_retry_time = 0; +} - xfs_buf_do_callbacks(bp); - bp->b_log_item = NULL; - list_del_init(&bp->b_li_list); - bp->b_iodone = NULL; - xfs_buf_ioend(bp); +/* + * Inode buffer iodone callback function. + */ +void +xfs_buf_inode_iodone( + struct xfs_buf *bp) +{ + if (bp->b_error) { + struct xfs_log_item *lip; + int ret = xfs_buf_iodone_error(bp); + + if (ret == XBF_IOERROR_FINISH) + goto finish_iodone; + if (ret == XBF_IOERROR_DONE) + return; + ASSERT(ret == XBF_IOERROR_FAIL); + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { + set_bit(XFS_LI_FAILED, &lip->li_flags); + } + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return; + } + +finish_iodone: + xfs_buf_clear_ioerror_retry_state(bp); + xfs_buf_item_done(bp); + xfs_iflush_done(bp); + xfs_buf_ioend_finish(bp); } /* - * This is the iodone() function for buffers which have been - * logged. It is called when they are eventually flushed out. - * It should remove the buf item from the AIL, and free the buf item. - * It is called by xfs_buf_iodone_callbacks() above which will take - * care of cleaning up the buffer itself. + * Dquot buffer iodone callback function. */ void -xfs_buf_iodone( - struct xfs_buf *bp, - struct xfs_log_item *lip) +xfs_buf_dquot_iodone( + struct xfs_buf *bp) { - ASSERT(BUF_ITEM(lip)->bli_buf == bp); + if (bp->b_error) { + struct xfs_log_item *lip; + int ret = xfs_buf_iodone_error(bp); + + if (ret == XBF_IOERROR_FINISH) + goto finish_iodone; + if (ret == XBF_IOERROR_DONE) + return; + ASSERT(ret == XBF_IOERROR_FAIL); + spin_lock(&bp->b_mount->m_ail->ail_lock); + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { + xfs_set_li_failed(lip, bp); + } + spin_unlock(&bp->b_mount->m_ail->ail_lock); + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return; + } - xfs_buf_rele(bp); +finish_iodone: + xfs_buf_clear_ioerror_retry_state(bp); + /* a newly allocated dquot buffer might have a log item attached */ + xfs_buf_item_done(bp); + xfs_dquot_done(bp); + xfs_buf_ioend_finish(bp); +} - /* - * If we are forcibly shutting down, this may well be off the AIL - * already. That's because we simulate the log-committed callbacks to - * unpin these buffers. Or we may never have put this item on AIL - * because of the transaction was aborted forcibly. - * xfs_trans_ail_delete() takes care of these. - * - * Either way, AIL is useless if we're forcing a shutdown. - */ - xfs_trans_ail_delete(lip, SHUTDOWN_CORRUPT_INCORE); - xfs_buf_item_free(BUF_ITEM(lip)); +/* + * Dirty buffer iodone callback function. + * + * Note that for things like remote attribute buffers, there may not be a buffer + * log item here, so processing the buffer log item must remain optional.
+ */ +void +xfs_buf_iodone( + struct xfs_buf *bp) +{ + if (bp->b_error) { + int ret = xfs_buf_iodone_error(bp); + + if (ret == XBF_IOERROR_FINISH) + goto finish_iodone; + if (ret == XBF_IOERROR_DONE) + return; + ASSERT(ret == XBF_IOERROR_FAIL); + ASSERT(list_empty(&bp->b_li_list)); + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return; + } + +finish_iodone: + xfs_buf_clear_ioerror_retry_state(bp); + xfs_buf_item_done(bp); + xfs_buf_ioend_finish(bp); } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index c9c57e2da932..23507cbb4c41 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -54,11 +54,9 @@ void xfs_buf_item_relse(struct xfs_buf *); bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); -void xfs_buf_attach_iodone(struct xfs_buf *, - void(*)(struct xfs_buf *, struct xfs_log_item *), - struct xfs_log_item *); -void xfs_buf_iodone_callbacks(struct xfs_buf *); -void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); +void xfs_buf_inode_iodone(struct xfs_buf *); +void xfs_buf_dquot_iodone(struct xfs_buf *); +void xfs_buf_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 04faa7310c4f..d480f11e6b00 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -419,8 +419,7 @@ xlog_recover_validate_buf_type( if (bp->b_ops) { struct xfs_buf_log_item *bip; - ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); - bp->b_iodone = xlog_recover_iodone; + bp->b_flags |= _XBF_LOGRECOVERY; xfs_buf_item_init(bp, mp); bip = bp->b_log_item; bip->bli_item.li_lsn = current_lsn; @@ -494,8 +493,7 @@ xlog_recover_do_reg_buffer( item->ri_buf[i].i_len, __func__); goto next; } - fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, - -1, 0); + fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1); if (fa) { xfs_alert(mp, "dquot corrupt at %pS trying to replay into block 0x%llx", @@ -548,11 +546,11 @@ xlog_recover_do_dquot_buffer( type = 0; if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) - type |= XFS_DQ_USER; + type |= XFS_DQTYPE_USER; if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) - type |= XFS_DQ_PROJ; + type |= XFS_DQTYPE_PROJ; if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) - type |= XFS_DQ_GROUP; + type |= XFS_DQTYPE_GROUP; /* * This type of quotas was turned off, so ignore this buffer */ @@ -963,7 +961,7 @@ xlog_recover_buf_commit_pass2( error = xfs_bwrite(bp); } else { ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; + bp->b_flags |= _XBF_LOGRECOVERY; xfs_buf_delwri_queue(bp, buffer_list); } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index d5b7f03e93c8..04dc2be19c3a 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -23,6 +23,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_bmap_btree.h" +#include "xfs_error.h" /* * Lock order: @@ -66,39 +67,61 @@ xfs_qm_dqdestroy( */ void xfs_qm_adjust_dqlimits( - struct xfs_mount *mp, struct xfs_dquot *dq) { + struct xfs_mount *mp = dq->q_mount; struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_disk_dquot *d = &dq->q_core; struct xfs_def_quota *defq; int prealloc = 0; - ASSERT(d->d_id); + ASSERT(dq->q_id); defq = xfs_get_defquota(q, xfs_dquot_type(dq)); - if (defq->bsoftlimit && !d->d_blk_softlimit) { - d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit); + if (!dq->q_blk.softlimit) { + dq->q_blk.softlimit 
= defq->blk.soft; prealloc = 1; } - if (defq->bhardlimit && !d->d_blk_hardlimit) { - d->d_blk_hardlimit = cpu_to_be64(defq->bhardlimit); + if (!dq->q_blk.hardlimit) { + dq->q_blk.hardlimit = defq->blk.hard; prealloc = 1; } - if (defq->isoftlimit && !d->d_ino_softlimit) - d->d_ino_softlimit = cpu_to_be64(defq->isoftlimit); - if (defq->ihardlimit && !d->d_ino_hardlimit) - d->d_ino_hardlimit = cpu_to_be64(defq->ihardlimit); - if (defq->rtbsoftlimit && !d->d_rtb_softlimit) - d->d_rtb_softlimit = cpu_to_be64(defq->rtbsoftlimit); - if (defq->rtbhardlimit && !d->d_rtb_hardlimit) - d->d_rtb_hardlimit = cpu_to_be64(defq->rtbhardlimit); + if (!dq->q_ino.softlimit) + dq->q_ino.softlimit = defq->ino.soft; + if (!dq->q_ino.hardlimit) + dq->q_ino.hardlimit = defq->ino.hard; + if (!dq->q_rtb.softlimit) + dq->q_rtb.softlimit = defq->rtb.soft; + if (!dq->q_rtb.hardlimit) + dq->q_rtb.hardlimit = defq->rtb.hard; if (prealloc) xfs_dquot_set_prealloc_limits(dq); } /* + * Determine if this quota counter is over either limit and set the quota + * timers as appropriate. + */ +static inline void +xfs_qm_adjust_res_timer( + struct xfs_dquot_res *res, + struct xfs_quota_limits *qlim) +{ + ASSERT(res->hardlimit == 0 || res->softlimit <= res->hardlimit); + + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (res->timer == 0) + res->timer = ktime_get_real_seconds() + qlim->time; + } else { + if (res->timer == 0) + res->warnings = 0; + else + res->timer = 0; + } +} + +/* * Check the limits and timers of a dquot and start or reset timers * if necessary. * This gets called even when quota enforcement is OFF, which makes our @@ -113,96 +136,18 @@ xfs_qm_adjust_dqlimits( */ void xfs_qm_adjust_dqtimers( - struct xfs_mount *mp, struct xfs_dquot *dq) { + struct xfs_mount *mp = dq->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; - struct xfs_disk_dquot *d = &dq->q_core; struct xfs_def_quota *defq; - ASSERT(d->d_id); + ASSERT(dq->q_id); defq = xfs_get_defquota(qi, xfs_dquot_type(dq)); -#ifdef DEBUG - if (d->d_blk_hardlimit) - ASSERT(be64_to_cpu(d->d_blk_softlimit) <= - be64_to_cpu(d->d_blk_hardlimit)); - if (d->d_ino_hardlimit) - ASSERT(be64_to_cpu(d->d_ino_softlimit) <= - be64_to_cpu(d->d_ino_hardlimit)); - if (d->d_rtb_hardlimit) - ASSERT(be64_to_cpu(d->d_rtb_softlimit) <= - be64_to_cpu(d->d_rtb_hardlimit)); -#endif - - if (!d->d_btimer) { - if ((d->d_blk_softlimit && - (be64_to_cpu(d->d_bcount) > - be64_to_cpu(d->d_blk_softlimit))) || - (d->d_blk_hardlimit && - (be64_to_cpu(d->d_bcount) > - be64_to_cpu(d->d_blk_hardlimit)))) { - d->d_btimer = cpu_to_be32(ktime_get_real_seconds() + - defq->btimelimit); - } else { - d->d_bwarns = 0; - } - } else { - if ((!d->d_blk_softlimit || - (be64_to_cpu(d->d_bcount) <= - be64_to_cpu(d->d_blk_softlimit))) && - (!d->d_blk_hardlimit || - (be64_to_cpu(d->d_bcount) <= - be64_to_cpu(d->d_blk_hardlimit)))) { - d->d_btimer = 0; - } - } - - if (!d->d_itimer) { - if ((d->d_ino_softlimit && - (be64_to_cpu(d->d_icount) > - be64_to_cpu(d->d_ino_softlimit))) || - (d->d_ino_hardlimit && - (be64_to_cpu(d->d_icount) > - be64_to_cpu(d->d_ino_hardlimit)))) { - d->d_itimer = cpu_to_be32(ktime_get_real_seconds() + - defq->itimelimit); - } else { - d->d_iwarns = 0; - } - } else { - if ((!d->d_ino_softlimit || - (be64_to_cpu(d->d_icount) <= - be64_to_cpu(d->d_ino_softlimit))) && - (!d->d_ino_hardlimit || - (be64_to_cpu(d->d_icount) <= - be64_to_cpu(d->d_ino_hardlimit)))) { - d->d_itimer = 0; - } - } - - if (!d->d_rtbtimer) { - if 
((d->d_rtb_softlimit && - (be64_to_cpu(d->d_rtbcount) > - be64_to_cpu(d->d_rtb_softlimit))) || - (d->d_rtb_hardlimit && - (be64_to_cpu(d->d_rtbcount) > - be64_to_cpu(d->d_rtb_hardlimit)))) { - d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() + - defq->rtbtimelimit); - } else { - d->d_rtbwarns = 0; - } - } else { - if ((!d->d_rtb_softlimit || - (be64_to_cpu(d->d_rtbcount) <= - be64_to_cpu(d->d_rtb_softlimit))) && - (!d->d_rtb_hardlimit || - (be64_to_cpu(d->d_rtbcount) <= - be64_to_cpu(d->d_rtb_hardlimit)))) { - d->d_rtbtimer = 0; - } - } + xfs_qm_adjust_res_timer(&dq->q_blk, &defq->blk); + xfs_qm_adjust_res_timer(&dq->q_ino, &defq->ino); + xfs_qm_adjust_res_timer(&dq->q_rtb, &defq->rtb); } /* @@ -213,7 +158,7 @@ xfs_qm_init_dquot_blk( struct xfs_trans *tp, struct xfs_mount *mp, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, struct xfs_buf *bp) { struct xfs_quotainfo *q = mp->m_quotainfo; @@ -226,6 +171,24 @@ xfs_qm_init_dquot_blk( ASSERT(tp); ASSERT(xfs_buf_islocked(bp)); + switch (type) { + case XFS_DQTYPE_USER: + qflag = XFS_UQUOTA_CHKD; + blftype = XFS_BLF_UDQUOT_BUF; + break; + case XFS_DQTYPE_PROJ: + qflag = XFS_PQUOTA_CHKD; + blftype = XFS_BLF_PDQUOT_BUF; + break; + case XFS_DQTYPE_GROUP: + qflag = XFS_GQUOTA_CHKD; + blftype = XFS_BLF_GDQUOT_BUF; + break; + default: + ASSERT(0); + return; + } + d = bp->b_addr; /* @@ -237,7 +200,7 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); - d->dd_diskdq.d_flags = type; + d->dd_diskdq.d_type = type; if (xfs_sb_version_hascrc(&mp->m_sb)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), @@ -245,17 +208,6 @@ xfs_qm_init_dquot_blk( } } - if (type & XFS_DQ_USER) { - qflag = XFS_UQUOTA_CHKD; - blftype = XFS_BLF_UDQUOT_BUF; - } else if (type & XFS_DQ_PROJ) { - qflag = XFS_PQUOTA_CHKD; - blftype = XFS_BLF_PDQUOT_BUF; - } else { - qflag = XFS_GQUOTA_CHKD; - blftype = XFS_BLF_GDQUOT_BUF; - } - xfs_trans_dquot_buf(tp, bp, blftype); /* @@ -290,8 +242,8 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) { uint64_t space; - dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit); - dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit); + dqp->q_prealloc_hi_wmark = dqp->q_blk.hardlimit; + dqp->q_prealloc_lo_wmark = dqp->q_blk.softlimit; if (!dqp->q_prealloc_lo_wmark) { dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; do_div(dqp->q_prealloc_lo_wmark, 100); @@ -321,14 +273,15 @@ xfs_dquot_disk_alloc( struct xfs_trans *tp = *tpp; struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *bp; - struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags); + xfs_dqtype_t qtype = xfs_dquot_type(dqp); + struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); int nmaps = 1; int error; trace_xfs_dqalloc(dqp); xfs_ilock(quotip, XFS_ILOCK_EXCL); - if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + if (!xfs_this_quota_on(dqp->q_mount, qtype)) { /* * Return if this type of quotas is turned off while we didn't * have an inode lock @@ -365,8 +318,7 @@ xfs_dquot_disk_alloc( * Make a chunk of dquots out of this buffer and log * the entire thing. 
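 *
 * A chunk here is one filesystem block's worth of dquots, and the whole
 * chunk is initialised even though only one ID was asked for. A rough
 * model of the geometry, with invented names (the real code derives
 * qi_dqperchunk from the block and dquot record sizes at mount time):
 */

#include <stdint.h>

/* how many dquot records fit in one block */
static uint32_t dqperchunk(uint32_t blocksize, uint32_t dqblk_size)
{
	return blocksize / dqblk_size;
}

/* the chunk holding an ID starts at the ID rounded down to a chunk
 * boundary; ascending IDs are then stamped into every record in it */
static uint32_t chunk_first_id(uint32_t id, uint32_t perchunk)
{
	return id - (id % perchunk);
}

/*
 * The 'curid' stamped into each ondisk dquot above starts from that
 * rounded-down value; the call below now passes the incore q_id and type
 * straight through.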
*/ - xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), - dqp->dq_flags & XFS_DQ_ALLTYPES, bp); + xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp); xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* @@ -413,13 +365,14 @@ xfs_dquot_disk_read( { struct xfs_bmbt_irec map; struct xfs_buf *bp; - struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags); + xfs_dqtype_t qtype = xfs_dquot_type(dqp); + struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); uint lock_mode; int nmaps = 1; int error; lock_mode = xfs_ilock_data_map_shared(quotip); - if (!xfs_this_quota_on(mp, dqp->dq_flags)) { + if (!xfs_this_quota_on(mp, qtype)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. @@ -471,14 +424,14 @@ STATIC struct xfs_dquot * xfs_dquot_alloc( struct xfs_mount *mp, xfs_dqid_t id, - uint type) + xfs_dqtype_t type) { struct xfs_dquot *dqp; - dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0); + dqp = kmem_cache_zalloc(xfs_qm_dqzone, GFP_KERNEL | __GFP_NOFAIL); - dqp->dq_flags = type; - dqp->q_core.d_id = cpu_to_be32(id); + dqp->q_type = type; + dqp->q_id = id; dqp->q_mount = mp; INIT_LIST_HEAD(&dqp->q_lru); mutex_init(&dqp->q_qlock); @@ -503,13 +456,13 @@ xfs_dquot_alloc( * quotas. */ switch (type) { - case XFS_DQ_USER: + case XFS_DQTYPE_USER: /* uses the default lock class */ break; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class); break; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class); break; default: @@ -524,26 +477,91 @@ xfs_dquot_alloc( } /* Copy the in-core quota fields in from the on-disk buffer. */ -STATIC void +STATIC int xfs_dquot_from_disk( struct xfs_dquot *dqp, struct xfs_buf *bp) { struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset; + /* + * Ensure that we got the type and ID we were looking for. + * Everything else was checked by the dquot buffer verifier. + */ + if ((ddqp->d_type & XFS_DQTYPE_REC_MASK) != xfs_dquot_type(dqp) || + be32_to_cpu(ddqp->d_id) != dqp->q_id) { + xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR, + "Metadata corruption detected at %pS, quota %u", + __this_address, dqp->q_id); + xfs_alert(bp->b_mount, "Unmount and run xfs_repair"); + return -EFSCORRUPTED; + } + /* copy everything from disk dquot to the incore dquot */ - memcpy(&dqp->q_core, ddqp, sizeof(struct xfs_disk_dquot)); + dqp->q_type = ddqp->d_type; + dqp->q_blk.hardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); + dqp->q_blk.softlimit = be64_to_cpu(ddqp->d_blk_softlimit); + dqp->q_ino.hardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); + dqp->q_ino.softlimit = be64_to_cpu(ddqp->d_ino_softlimit); + dqp->q_rtb.hardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); + dqp->q_rtb.softlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + + dqp->q_blk.count = be64_to_cpu(ddqp->d_bcount); + dqp->q_ino.count = be64_to_cpu(ddqp->d_icount); + dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount); + + dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns); + dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns); + dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns); + + dqp->q_blk.timer = be32_to_cpu(ddqp->d_btimer); + dqp->q_ino.timer = be32_to_cpu(ddqp->d_itimer); + dqp->q_rtb.timer = be32_to_cpu(ddqp->d_rtbtimer); /* * Reservation counters are defined as reservation plus current usage * to avoid having to add every time. 
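 *
 * In other words, 'reserved' is seeded from 'count' so that every limit
 * check can test the single 'reserved' value instead of computing
 * count + outstanding reservations each time. A minimal user-space model
 * of that convention (invented names, not this patch's API):
 */

#include <assert.h>

struct res_sketch {
	unsigned long long count;	/* resources currently allocated */
	unsigned long long reserved;	/* count plus outstanding reservations */
	unsigned long long hardlimit;	/* 0 means no limit */
};

/* a limit check only ever looks at 'reserved' */
static int res_try_reserve(struct res_sketch *r, unsigned long long n)
{
	if (r->hardlimit && r->reserved + n > r->hardlimit)
		return -1;
	r->reserved += n;
	return 0;
}

/* on commit, usage moves into 'count'; 'reserved' already includes it */
static void res_commit(struct res_sketch *r, unsigned long long n)
{
	r->count += n;
	assert(r->count <= r->reserved);
}

/*
 * The assignments below are that seeding step, performed when the dquot is
 * read in from disk.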
*/ - dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); - dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); - dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); + dqp->q_blk.reserved = dqp->q_blk.count; + dqp->q_ino.reserved = dqp->q_ino.count; + dqp->q_rtb.reserved = dqp->q_rtb.count; /* initialize the dquot speculative prealloc thresholds */ xfs_dquot_set_prealloc_limits(dqp); + return 0; +} + +/* Copy the in-core quota fields into the on-disk buffer. */ +void +xfs_dquot_to_disk( + struct xfs_disk_dquot *ddqp, + struct xfs_dquot *dqp) +{ + ddqp->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + ddqp->d_version = XFS_DQUOT_VERSION; + ddqp->d_type = dqp->q_type; + ddqp->d_id = cpu_to_be32(dqp->q_id); + ddqp->d_pad0 = 0; + ddqp->d_pad = 0; + + ddqp->d_blk_hardlimit = cpu_to_be64(dqp->q_blk.hardlimit); + ddqp->d_blk_softlimit = cpu_to_be64(dqp->q_blk.softlimit); + ddqp->d_ino_hardlimit = cpu_to_be64(dqp->q_ino.hardlimit); + ddqp->d_ino_softlimit = cpu_to_be64(dqp->q_ino.softlimit); + ddqp->d_rtb_hardlimit = cpu_to_be64(dqp->q_rtb.hardlimit); + ddqp->d_rtb_softlimit = cpu_to_be64(dqp->q_rtb.softlimit); + + ddqp->d_bcount = cpu_to_be64(dqp->q_blk.count); + ddqp->d_icount = cpu_to_be64(dqp->q_ino.count); + ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count); + + ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings); + ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings); + ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings); + + ddqp->d_btimer = cpu_to_be32(dqp->q_blk.timer); + ddqp->d_itimer = cpu_to_be32(dqp->q_ino.timer); + ddqp->d_rtbtimer = cpu_to_be32(dqp->q_rtb.timer); } /* Allocate and initialize the dquot buffer for this in-core dquot. */ @@ -592,7 +610,7 @@ static int xfs_qm_dqread( struct xfs_mount *mp, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, bool can_alloc, struct xfs_dquot **dqpp) { @@ -617,9 +635,11 @@ xfs_qm_dqread( * further. 
*/ ASSERT(xfs_buf_islocked(bp)); - xfs_dquot_from_disk(dqp, bp); - + error = xfs_dquot_from_disk(dqp, bp); xfs_buf_relse(bp); + if (error) + goto err; + *dqpp = dqp; return error; @@ -638,7 +658,7 @@ err: static int xfs_dq_get_next_id( struct xfs_mount *mp, - uint type, + xfs_dqtype_t type, xfs_dqid_t *id) { struct xfs_inode *quotip = xfs_quota_inode(mp, type); @@ -706,7 +726,7 @@ restart: } xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_FREEING) { + if (dqp->q_flags & XFS_DQFLAG_FREEING) { xfs_dqunlock(dqp); mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_freeing(dqp); @@ -762,21 +782,21 @@ xfs_qm_dqget_cache_insert( static int xfs_qm_dqget_checks( struct xfs_mount *mp, - uint type) + xfs_dqtype_t type) { if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp))) return -ESRCH; switch (type) { - case XFS_DQ_USER: + case XFS_DQTYPE_USER: if (!XFS_IS_UQUOTA_ON(mp)) return -ESRCH; return 0; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: if (!XFS_IS_GQUOTA_ON(mp)) return -ESRCH; return 0; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: if (!XFS_IS_PQUOTA_ON(mp)) return -ESRCH; return 0; @@ -794,7 +814,7 @@ int xfs_qm_dqget( struct xfs_mount *mp, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, bool can_alloc, struct xfs_dquot **O_dqpp) { @@ -844,7 +864,7 @@ int xfs_qm_dqget_uncached( struct xfs_mount *mp, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, struct xfs_dquot **dqpp) { int error; @@ -860,14 +880,14 @@ xfs_qm_dqget_uncached( xfs_dqid_t xfs_qm_id_for_quotatype( struct xfs_inode *ip, - uint type) + xfs_dqtype_t type) { switch (type) { - case XFS_DQ_USER: + case XFS_DQTYPE_USER: return i_uid_read(VFS_I(ip)); - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return i_gid_read(VFS_I(ip)); - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return ip->i_d.di_projid; } ASSERT(0); @@ -882,7 +902,7 @@ xfs_qm_id_for_quotatype( int xfs_qm_dqget_inode( struct xfs_inode *ip, - uint type, + xfs_dqtype_t type, bool can_alloc, struct xfs_dquot **O_dqpp) { @@ -968,7 +988,7 @@ int xfs_qm_dqget_next( struct xfs_mount *mp, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, struct xfs_dquot **dqpp) { struct xfs_dquot *dqp; @@ -1048,9 +1068,8 @@ xfs_qm_dqrele( * from the AIL if it has not been re-logged, and unlocking the dquot's * flush lock. This behavior is very similar to that of inodes.. */ -STATIC void +static void xfs_qm_dqflush_done( - struct xfs_buf *bp, struct xfs_log_item *lip) { struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip; @@ -1071,16 +1090,12 @@ xfs_qm_dqflush_done( test_bit(XFS_LI_FAILED, &lip->li_flags))) { spin_lock(&ailp->ail_lock); + xfs_clear_li_failed(lip); if (lip->li_lsn == qip->qli_flush_lsn) { /* xfs_ail_update_finish() drops the AIL lock */ tail_lsn = xfs_ail_delete_one(ailp, lip); xfs_ail_update_finish(ailp, tail_lsn); } else { - /* - * Clear the failed state since we are about to drop the - * flush lock - */ - xfs_clear_li_failed(lip); spin_unlock(&ailp->ail_lock); } } @@ -1091,6 +1106,48 @@ xfs_qm_dqflush_done( xfs_dqfunlock(dqp); } +void +xfs_dquot_done( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip, *n; + + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { + list_del_init(&lip->li_bio_list); + xfs_qm_dqflush_done(lip); + } +} + +/* Check incore dquot for errors before we flush. 
*/ +static xfs_failaddr_t +xfs_qm_dqflush_check( + struct xfs_dquot *dqp) +{ + xfs_dqtype_t type = xfs_dquot_type(dqp); + + if (type != XFS_DQTYPE_USER && + type != XFS_DQTYPE_GROUP && + type != XFS_DQTYPE_PROJ) + return __this_address; + + if (dqp->q_id == 0) + return NULL; + + if (dqp->q_blk.softlimit && dqp->q_blk.count > dqp->q_blk.softlimit && + !dqp->q_blk.timer) + return __this_address; + + if (dqp->q_ino.softlimit && dqp->q_ino.count > dqp->q_ino.softlimit && + !dqp->q_ino.timer) + return __this_address; + + if (dqp->q_rtb.softlimit && dqp->q_rtb.count > dqp->q_rtb.softlimit && + !dqp->q_rtb.timer) + return __this_address; + + return NULL; +} + /* * Write a modified dquot to disk. * The dquot must be locked and the flush lock too taken by caller. @@ -1107,8 +1164,7 @@ xfs_qm_dqflush( struct xfs_mount *mp = dqp->q_mount; struct xfs_log_item *lip = &dqp->q_logitem.qli_item; struct xfs_buf *bp; - struct xfs_dqblk *dqb; - struct xfs_disk_dquot *ddqp; + struct xfs_dqblk *dqblk; xfs_failaddr_t fa; int error; @@ -1132,30 +1188,23 @@ xfs_qm_dqflush( if (error) goto out_abort; - /* - * Calculate the location of the dquot inside the buffer. - */ - dqb = bp->b_addr + dqp->q_bufoffset; - ddqp = &dqb->dd_diskdq; - - /* sanity check the in-core structure before we flush */ - fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(dqp->q_core.d_id), - 0); + fa = xfs_qm_dqflush_check(dqp); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", - be32_to_cpu(dqp->q_core.d_id), fa); + dqp->q_id, fa); xfs_buf_relse(bp); error = -EFSCORRUPTED; goto out_abort; } - /* This is the only portion of data that needs to persist */ - memcpy(ddqp, &dqp->q_core, sizeof(struct xfs_disk_dquot)); + /* Flush the incore dquot to the ondisk buffer. */ + dqblk = bp->b_addr + dqp->q_bufoffset; + xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp); /* * Clear the dirty field and remember the flush lsn for later use. */ - dqp->dq_flags &= ~XFS_DQ_DIRTY; + dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, &dqp->q_logitem.qli_item.li_lsn); @@ -1170,17 +1219,17 @@ xfs_qm_dqflush( * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_sb_version_hascrc(&mp->m_sb)) { - dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); - xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), + dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } /* - * Attach an iodone routine so that we can remove this dquot from the - * AIL and release the flush lock once the dquot is synced to disk. + * Attach the dquot to the buffer so that we can remove this dquot from + * the AIL and release the flush lock once the dquot is synced to disk. 
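 *
 * With the per-buffer b_iodone function pointer gone, completion routing is
 * keyed off buffer type flags instead. Assuming the flag set this series
 * introduces (_XBF_DQUOTS here, with inode and log-recovery counterparts
 * set elsewhere in the patch), the completion-side dispatch reduces to
 * something like the following sketch (stub names, simplified):
 */

#define SB_INODES	(1u << 0)	/* stand-ins for the _XBF_* flags */
#define SB_DQUOTS	(1u << 1)
#define SB_LOGRECOVERY	(1u << 2)

struct xbuf_sketch { unsigned int flags; };

static void inode_iodone(struct xbuf_sketch *bp) { }
static void dquot_iodone(struct xbuf_sketch *bp) { }
static void plain_iodone(struct xbuf_sketch *bp) { }

/* one switch at I/O completion replaces per-buffer callback pointers */
static void ioend_dispatch(struct xbuf_sketch *bp)
{
	if (bp->flags & SB_DQUOTS)
		dquot_iodone(bp);	/* walks b_li_list of dquot items */
	else if (bp->flags & SB_INODES)
		inode_iodone(bp);
	else
		plain_iodone(bp);
}

/*
 * Setting _XBF_DQUOTS and queueing the log item on b_li_list below is what
 * replaces the old xfs_buf_attach_iodone() call.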
*/ - xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, - &dqp->q_logitem.qli_item); + bp->b_flags |= _XBF_DQUOTS; + list_add_tail(&dqp->q_logitem.qli_item.li_bio_list, &bp->b_li_list); /* * If the buffer is pinned then push on the log so we won't @@ -1196,7 +1245,7 @@ xfs_qm_dqflush( return 0; out_abort: - dqp->dq_flags &= ~XFS_DQ_DIRTY; + dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_delete(lip, 0); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); out_unlock: @@ -1217,8 +1266,7 @@ xfs_dqlock2( { if (d1 && d2) { ASSERT(d1 != d2); - if (be32_to_cpu(d1->q_core.d_id) > - be32_to_cpu(d2->q_core.d_id)) { + if (d1->q_id > d2->q_id) { mutex_lock(&d2->q_qlock); mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED); } else { @@ -1270,7 +1318,7 @@ xfs_qm_exit(void) int xfs_qm_dqiterate( struct xfs_mount *mp, - uint dqtype, + xfs_dqtype_t type, xfs_qm_dqiterate_fn iter_fn, void *priv) { @@ -1279,16 +1327,15 @@ xfs_qm_dqiterate( int error; do { - error = xfs_qm_dqget_next(mp, id, dqtype, &dq); + error = xfs_qm_dqget_next(mp, id, type, &dq); if (error == -ENOENT) return 0; if (error) return error; - error = iter_fn(dq, dqtype, priv); - id = be32_to_cpu(dq->q_core.d_id); + error = iter_fn(dq, type, priv); + id = dq->q_id + 1; xfs_qm_dqput(dq); - id++; } while (error == 0 && id != 0); return error; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 71e36c85e20b..282a65da93c7 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -27,26 +27,53 @@ enum { XFS_QLOWSP_MAX }; +struct xfs_dquot_res { + /* Total resources allocated and reserved. */ + xfs_qcnt_t reserved; + + /* Total resources allocated. */ + xfs_qcnt_t count; + + /* Absolute and preferred limits. */ + xfs_qcnt_t hardlimit; + xfs_qcnt_t softlimit; + + /* + * For root dquots, this is the default grace period, in seconds. + * Otherwise, this is when the quota grace period expires, + * in seconds since the Unix epoch. + */ + time64_t timer; + + /* + * For root dquots, this is the maximum number of warnings that will + * be issued for this quota type. Otherwise, this is the number of + * warnings issued against this quota. Note that none of this is + * implemented.
+ */ + xfs_qwarncnt_t warnings; +}; + /* * The incore dquot structure */ struct xfs_dquot { - uint dq_flags; struct list_head q_lru; struct xfs_mount *q_mount; + xfs_dqtype_t q_type; + uint16_t q_flags; + xfs_dqid_t q_id; uint q_nrefs; - xfs_daddr_t q_blkno; int q_bufoffset; + xfs_daddr_t q_blkno; xfs_fileoff_t q_fileoffset; - struct xfs_disk_dquot q_core; + struct xfs_dquot_res q_blk; /* regular blocks */ + struct xfs_dquot_res q_ino; /* inodes */ + struct xfs_dquot_res q_rtb; /* realtime blocks */ + struct xfs_dq_logitem q_logitem; - /* total regular nblks used+reserved */ - xfs_qcnt_t q_res_bcount; - /* total inos allocd+reserved */ - xfs_qcnt_t q_res_icount; - /* total realtime blks used+reserved */ - xfs_qcnt_t q_res_rtbcount; + xfs_qcnt_t q_prealloc_lo_wmark; xfs_qcnt_t q_prealloc_hi_wmark; int64_t q_low_space[XFS_QLOWSP_MAX]; @@ -101,34 +128,59 @@ static inline void xfs_dqunlock(struct xfs_dquot *dqp) mutex_unlock(&dqp->q_qlock); } -static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) +static inline int +xfs_dquot_type(const struct xfs_dquot *dqp) { - switch (type & XFS_DQ_ALLTYPES) { - case XFS_DQ_USER: + return dqp->q_type & XFS_DQTYPE_REC_MASK; +} + +static inline int xfs_this_quota_on(struct xfs_mount *mp, xfs_dqtype_t type) +{ + switch (type) { + case XFS_DQTYPE_USER: return XFS_IS_UQUOTA_ON(mp); - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return XFS_IS_GQUOTA_ON(mp); - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return XFS_IS_PQUOTA_ON(mp); default: return 0; } } -static inline struct xfs_dquot *xfs_inode_dquot(struct xfs_inode *ip, int type) +static inline struct xfs_dquot *xfs_inode_dquot( + struct xfs_inode *ip, + xfs_dqtype_t type) { - switch (type & XFS_DQ_ALLTYPES) { - case XFS_DQ_USER: + switch (type) { + case XFS_DQTYPE_USER: return ip->i_udquot; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return ip->i_gdquot; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return ip->i_pdquot; default: return NULL; } } +/* Decide if the dquot's limits are actually being enforced. */ +static inline bool +xfs_dquot_is_enforced( + const struct xfs_dquot *dqp) +{ + switch (xfs_dquot_type(dqp)) { + case XFS_DQTYPE_USER: + return XFS_IS_UQUOTA_ENFORCED(dqp->q_mount); + case XFS_DQTYPE_GROUP: + return XFS_IS_GQUOTA_ENFORCED(dqp->q_mount); + case XFS_DQTYPE_PROJ: + return XFS_IS_PQUOTA_ENFORCED(dqp->q_mount); + } + ASSERT(0); + return false; +} + /* * Check whether a dquot is under low free space conditions. We assume the quota * is enabled and enforced. 
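 *
 * xfs_dquot_is_enforced() above is what lets callers make that assumption
 * explicit. As a general illustration of the split between accounting and
 * enforcement (a hedged sketch with stub types, not code from this patch):
 */

#include <stdbool.h>
#include <stdint.h>

struct dqres_sketch { uint64_t reserved, hardlimit; };

static bool is_enforced(void) { return true; }	/* stub for the real check */

/* accounting always proceeds; only failing a request requires that
 * enforcement is actually switched on for the dquot's type */
static bool may_reserve(struct dqres_sketch *r, uint64_t n)
{
	if (is_enforced() && r->hardlimit && r->reserved + n > r->hardlimit)
		return false;
	r->reserved += n;
	return true;
}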
@@ -137,38 +189,35 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) { int64_t freesp; - freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount; + freesp = dqp->q_blk.hardlimit - dqp->q_blk.reserved; if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT]) return true; return false; } +void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); + #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) -#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) -#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) -#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) -#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) +#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY) void xfs_qm_dqdestroy(struct xfs_dquot *dqp); int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); -void xfs_qm_adjust_dqtimers(struct xfs_mount *mp, - struct xfs_dquot *d); -void xfs_qm_adjust_dqlimits(struct xfs_mount *mp, - struct xfs_dquot *d); -xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, uint type); +void xfs_qm_adjust_dqtimers(struct xfs_dquot *d); +void xfs_qm_adjust_dqlimits(struct xfs_dquot *d); +xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, + xfs_dqtype_t type); int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id, - uint type, bool can_alloc, - struct xfs_dquot **dqpp); -int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type, - bool can_alloc, - struct xfs_dquot **dqpp); + xfs_dqtype_t type, bool can_alloc, + struct xfs_dquot **dqpp); +int xfs_qm_dqget_inode(struct xfs_inode *ip, xfs_dqtype_t type, + bool can_alloc, struct xfs_dquot **dqpp); int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, - uint type, struct xfs_dquot **dqpp); + xfs_dqtype_t type, struct xfs_dquot **dqpp); int xfs_qm_dqget_uncached(struct xfs_mount *mp, - xfs_dqid_t id, uint type, - struct xfs_dquot **dqpp); + xfs_dqid_t id, xfs_dqtype_t type, + struct xfs_dquot **dqpp); void xfs_qm_dqput(struct xfs_dquot *dqp); void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); @@ -183,9 +232,9 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } -typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, uint dqtype, - void *priv); -int xfs_qm_dqiterate(struct xfs_mount *mp, uint dqtype, +typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, + xfs_dqtype_t type, void *priv); +int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type, xfs_qm_dqiterate_fn iter_fn, void *priv); #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 349c92d26570..8c1fdf37ee8f 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -45,6 +45,7 @@ xfs_qm_dquot_logitem_format( struct xfs_log_item *lip, struct xfs_log_vec *lv) { + struct xfs_disk_dquot ddq; struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); struct xfs_log_iovec *vecp = NULL; struct xfs_dq_logformat *qlf; @@ -52,14 +53,15 @@ xfs_qm_dquot_logitem_format( qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT); qlf->qlf_type = XFS_LI_DQUOT; qlf->qlf_size = 2; - qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id); + qlf->qlf_id = qlip->qli_dquot->q_id; qlf->qlf_blkno = qlip->qli_dquot->q_blkno; qlf->qlf_len = 1; qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset; xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat)); - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, - &qlip->qli_dquot->q_core, + xfs_dquot_to_disk(&ddq, qlip->qli_dquot); + + 
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, &ddq, sizeof(struct xfs_disk_dquot)); } @@ -113,23 +115,6 @@ xfs_qm_dqunpin_wait( wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } -/* - * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer - * have been failed during writeback - * - * this informs the AIL that the dquot is already flush locked on the next push, - * and acquires a hold on the buffer to ensure that it isn't reclaimed before - * dirty data makes it to disk. - */ -STATIC void -xfs_dquot_item_error( - struct xfs_log_item *lip, - struct xfs_buf *bp) -{ - ASSERT(!completion_done(&DQUOT_ITEM(lip)->qli_dquot->q_flush)); - xfs_set_li_failed(lip, bp); -} - STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, @@ -216,7 +201,6 @@ static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_release = xfs_qm_dquot_logitem_release, .iop_committing = xfs_qm_dquot_logitem_committing, .iop_push = xfs_qm_dquot_logitem_push, - .iop_error = xfs_dquot_item_error }; /* diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c index 3400be4c88f0..5875c7e1bd28 100644 --- a/fs/xfs/xfs_dquot_item_recover.c +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -39,7 +39,7 @@ xlog_recover_dquot_ra_pass2( if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) return; - type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + type = recddq->d_type & XFS_DQTYPE_REC_MASK; ASSERT(type); if (log->l_quotaoffs_flag & type) return; @@ -91,7 +91,7 @@ xlog_recover_dquot_commit_pass2( /* * This type of quotas was turned off, so ignore this record. */ - type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + type = recddq->d_type & XFS_DQTYPE_REC_MASK; ASSERT(type); if (log->l_quotaoffs_flag & type) return 0; @@ -108,7 +108,7 @@ xlog_recover_dquot_commit_pass2( */ dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); - fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0); + fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", dq_f->qlf_id, fa); @@ -153,7 +153,7 @@ xlog_recover_dquot_commit_pass2( ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; + bp->b_flags |= _XBF_LOGRECOVERY; xfs_buf_delwri_queue(bp, buffer_list); out_release: @@ -185,11 +185,11 @@ xlog_recover_quotaoff_commit_pass1( * group/project quotaoff or both. 
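 *
 * The recorded mask is consulted in pass 2, where dquot buffers and dquot
 * log items of a switched-off type are skipped rather than replayed. A
 * self-contained model of the two passes (invented names, simplified):
 */

#include <stdbool.h>

#define DQT_USER	(1u << 0)	/* stand-ins for XFS_DQTYPE_* */
#define DQT_PROJ	(1u << 1)
#define DQT_GROUP	(1u << 2)

static unsigned int quotaoffs_mask;

/* pass 1: remember which quota types saw a quotaoff in the log */
static void record_quotaoff(unsigned int dqtype)
{
	quotaoffs_mask |= dqtype;
}

/* pass 2: updates for a type being turned off are stale, skip them */
static bool should_replay_dquot(unsigned int dqtype)
{
	return !(quotaoffs_mask & dqtype);
}

/*
 * The flag translation below is the pass 1 half of that, now phrased in
 * terms of the XFS_DQTYPE_* namespace.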
*/ if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_USER; + log->l_quotaoffs_flag |= XFS_DQTYPE_USER; if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_PROJ; + log->l_quotaoffs_flag |= XFS_DQTYPE_PROJ; if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_GROUP; + log->l_quotaoffs_flag |= XFS_DQTYPE_GROUP; return 0; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index b9c333bae0a1..6cb8cd11072a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -161,7 +161,8 @@ xfs_efi_init( ((nextents - 1) * sizeof(xfs_extent_t))); efip = kmem_zalloc(size, 0); } else { - efip = kmem_zone_zalloc(xfs_efi_zone, 0); + efip = kmem_cache_zalloc(xfs_efi_zone, + GFP_KERNEL | __GFP_NOFAIL); } xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); @@ -332,7 +333,8 @@ xfs_trans_get_efd( (nextents - 1) * sizeof(struct xfs_extent), 0); } else { - efdp = kmem_zone_zalloc(xfs_efd_zone, 0); + efdp = kmem_cache_zalloc(xfs_efd_zone, + GFP_KERNEL | __GFP_NOFAIL); } xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d538411c3791..c31cd3be9fb2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -94,6 +94,7 @@ xfs_file_fsync( { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode_log_item *iip = ip->i_itemp; struct xfs_mount *mp = ip->i_mount; int error = 0; int log_flushed = 0; @@ -137,13 +138,15 @@ xfs_file_fsync( xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { if (!datasync || - (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - lsn = ip->i_itemp->ili_last_lsn; + (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + lsn = iip->ili_last_lsn; } if (lsn) { error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); - ip->i_itemp->ili_fsync_fields = 0; + spin_lock(&iip->ili_lock); + iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); } xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -1035,7 +1038,7 @@ xfs_file_remap_range( /* Prepare and then clone file data. */ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, &len, remap_flags); - if (ret < 0 || len == 0) + if (ret || len == 0) return ret; trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); @@ -1065,7 +1068,7 @@ xfs_file_remap_range( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_log_force_inode(dest); out_unlock: - xfs_reflink_remap_unlock(file_in, file_out); + xfs_iunlock2_io_mmap(src, dest); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return remapped > 0 ? 
remapped : ret; @@ -1263,10 +1266,23 @@ xfs_filemap_pfn_mkwrite( return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } +static void +xfs_filemap_map_pages( + struct vm_fault *vmf, + pgoff_t start_pgoff, + pgoff_t end_pgoff) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + filemap_map_pages(vmf, start_pgoff, end_pgoff); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); +} + static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, .huge_fault = xfs_filemap_huge_fault, - .map_pages = filemap_map_pages, + .map_pages = xfs_filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, .pfn_mkwrite = xfs_filemap_pfn_mkwrite, }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 5daef654956c..101028ebb571 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -37,13 +37,11 @@ xfs_inode_alloc( struct xfs_inode *ip; /* - * if this didn't occur in transactions, we could use - * KM_MAYFAIL and return NULL here on ENOMEM. Set the - * code up to do this anyway. + * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL + * and return NULL here on ENOMEM. */ - ip = kmem_zone_alloc(xfs_inode_zone, 0); - if (!ip) - return NULL; + ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL); + if (inode_init_always(mp->m_super, VFS_I(ip))) { kmem_cache_free(xfs_inode_zone, ip); return NULL; @@ -115,6 +113,7 @@ __xfs_inode_free( { /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list)); XFS_STATS_DEC(ip->i_mount, vn_active); call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); @@ -141,11 +140,8 @@ xfs_inode_free( } /* - * Queue a new inode reclaim pass if there are reclaimable inodes and there - * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs periodic sync default of 30s. Perhaps this should have it's own - * tunable, but that can be done if this method proves to be ineffective or too - * aggressive. + * Queue background inode reclaim work if there are reclaimable inodes and there + * isn't reclaim work already scheduled or in progress. */ static void xfs_reclaim_work_queue( @@ -160,24 +156,6 @@ xfs_reclaim_work_queue( rcu_read_unlock(); } -/* - * This is a fast pass over the inode cache to try to get reclaim moving on as - * many inodes as possible in a short period of time. It kicks itself every few - * seconds, as well as being kicked by the inode cache shrinker when memory - * goes low. It scans as quickly as possible avoiding locked inodes or those - * already being flushed, and once done schedules a future pass. - */ -void -xfs_reclaim_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_reclaim_work); - - xfs_reclaim_inodes(mp, SYNC_TRYLOCK); - xfs_reclaim_work_queue(mp); -} - static void xfs_perag_set_reclaim_tag( struct xfs_perag *pag) @@ -618,48 +596,31 @@ out_destroy: } /* - * Look up an inode by number in the given file system. - * The inode is looked up in the cache held in each AG. - * If the inode is found in the cache, initialise the vfs inode - * if necessary. + * Look up an inode by number in the given file system. The inode is looked up + * in the cache held in each AG. If the inode is found in the cache, initialise + * the vfs inode if necessary. 
* - * If it is not in core, read it in from the file system's device, - * add it to the cache and initialise the vfs inode. + * If it is not in core, read it in from the file system's device, add it to the + * cache and initialise the vfs inode. * * The inode is locked according to the value of the lock_flags parameter. - * This flag parameter indicates how and if the inode's IO lock and inode lock - * should be taken. - * - * mp -- the mount point structure for the current file system. It points - * to the inode hash table. - * tp -- a pointer to the current transaction if there is one. This is - * simply passed through to the xfs_iread() call. - * ino -- the number of the inode desired. This is the unique identifier - * within the file system for the inode being requested. - * lock_flags -- flags indicating how to lock the inode. See the comment - * for xfs_ilock() for a list of valid values. + * Inode lookup is only done during metadata operations and not as part of the + * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup. */ int xfs_iget( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - uint flags, - uint lock_flags, - xfs_inode_t **ipp) + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + struct xfs_inode **ipp) { - xfs_inode_t *ip; - int error; - xfs_perag_t *pag; - xfs_agino_t agino; + struct xfs_inode *ip; + struct xfs_perag *pag; + xfs_agino_t agino; + int error; - /* - * xfs_reclaim_inode() uses the ILOCK to ensure an inode - * doesn't get freed while it's being referenced during a - * radix tree traversal here. It assumes this function - * aqcuires only the ILOCK (and therefore it has no need to - * involve the IOLOCK in this synchronization). - */ ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); /* reject inode numbers outside existing AGs */ @@ -776,15 +737,7 @@ xfs_inode_walk_ag_grab( ASSERT(rcu_read_lock_held()); - /* - * check for stale RCU freed inode - * - * If the inode has been reallocated, it doesn't matter if it's not in - * the AG we are walking - we are walking for writeback, so if it - * passes all the "valid inode" checks and is dirty, then we'll write - * it back anyway. If it has been reallocated and still being - * initialised, the XFS_INEW check below will catch it. - */ + /* Check for stale RCU freed inode */ spin_lock(&ip->i_flags_lock); if (!ip->i_ino) goto out_unlock_noent; @@ -1028,107 +981,62 @@ xfs_cowblocks_worker( /* * Grab the inode for reclaim exclusively. - * Return 0 if we grabbed it, non-zero otherwise. + * + * We have found this inode via a lookup under RCU, so the inode may have + * already been freed, or it may be in the process of being recycled by + * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode + * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE + * will not be set. Hence we need to check for both these flag conditions to + * avoid inodes that are no longer reclaim candidates. + * + * Note: checking for other state flags here, under the i_flags_lock or not, is + * racy and should be avoided. Those races should be resolved only after we have + * ensured that we are able to reclaim this inode and the world can see that we + * are going to reclaim it. + * + * Return true if we grabbed it, false otherwise. 
*/ -STATIC int +static bool xfs_reclaim_inode_grab( - struct xfs_inode *ip, - int flags) + struct xfs_inode *ip) { ASSERT(rcu_read_lock_held()); - /* quick check for stale RCU freed inode */ - if (!ip->i_ino) - return 1; - - /* - * If we are asked for non-blocking operation, do unlocked checks to - * see if the inode already is being flushed or in reclaim to avoid - * lock traffic. - */ - if ((flags & SYNC_TRYLOCK) && - __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) - return 1; - - /* - * The radix tree lock here protects a thread in xfs_iget from racing - * with us starting reclaim on the inode. Once we have the - * XFS_IRECLAIM flag set it will not touch us. - * - * Due to RCU lookup, we may find inodes that have been freed and only - * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that - * aren't candidates for reclaim at all, so we must check the - * XFS_IRECLAIMABLE is set first before proceeding to reclaim. - */ spin_lock(&ip->i_flags_lock); if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || __xfs_iflags_test(ip, XFS_IRECLAIM)) { /* not a reclaim candidate. */ spin_unlock(&ip->i_flags_lock); - return 1; + return false; } __xfs_iflags_set(ip, XFS_IRECLAIM); spin_unlock(&ip->i_flags_lock); - return 0; + return true; } /* - * Inodes in different states need to be treated differently. The following - * table lists the inode states and the reclaim actions necessary: - * - * inode state iflush ret required action - * --------------- ---------- --------------- - * bad - reclaim - * shutdown EIO unpin and reclaim - * clean, unpinned 0 reclaim - * stale, unpinned 0 reclaim - * clean, pinned(*) 0 requeue - * stale, pinned EAGAIN requeue - * dirty, async - requeue - * dirty, sync 0 reclaim + * Inode reclaim is non-blocking, so the default action if progress cannot be + * made is to "requeue" the inode for reclaim by unlocking it and clearing the + * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about + * blocking anymore and hence we can wait for the inode to be able to reclaim + * it. * - * (*) dgc: I don't think the clean, pinned state is possible but it gets - * handled anyway given the order of checks implemented. - * - * Also, because we get the flush lock first, we know that any inode that has - * been flushed delwri has had the flush completed by the time we check that - * the inode is clean. - * - * Note that because the inode is flushed delayed write by AIL pushing, the - * flush lock may already be held here and waiting on it can result in very - * long latencies. Hence for sync reclaims, where we wait on the flush lock, - * the caller should push the AIL first before trying to reclaim inodes to - * minimise the amount of time spent waiting. For background relaim, we only - * bother to reclaim clean inodes anyway. - * - * Hence the order of actions after gaining the locks should be: - * bad => reclaim - * shutdown => unpin and reclaim - * pinned, async => requeue - * pinned, sync => unpin - * stale => reclaim - * clean => reclaim - * dirty, async => requeue - * dirty, sync => flush, wait and reclaim + * We do no IO here - if callers require inodes to be cleaned they must push the + * AIL first to trigger writeback of dirty inodes. This enables writeback to be + * done in the background in a non-blocking manner, and enables memory reclaim + * to make progress without blocking. 
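 *
 * The resulting policy is a short ladder of trylock-style checks: any step
 * that cannot proceed immediately requeues the inode for a later pass. A
 * stand-alone reduction of that ladder (stub types, invented names):
 */

#include <stdbool.h>

struct istate_sketch {
	bool ilock_ok;	/* xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) succeeded */
	bool iflock_ok;	/* xfs_iflock_nowait(ip) succeeded */
	bool shutdown;	/* XFS_FORCED_SHUTDOWN(mp) */
	bool pinned;	/* xfs_ipincount(ip) != 0 */
	bool clean;	/* xfs_inode_clean(ip) */
};

enum verdict { RECLAIM_NOW, REQUEUE };

static enum verdict reclaim_policy(const struct istate_sketch *s)
{
	if (!s->ilock_ok || !s->iflock_ok)
		return REQUEUE;		/* in use or already being flushed */
	if (s->shutdown)
		return RECLAIM_NOW;	/* unpin, abort the flush, reclaim */
	if (s->pinned || !s->clean)
		return REQUEUE;		/* needs log or AIL work first */
	return RECLAIM_NOW;
}

/*
 * The function below implements exactly that ladder.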
*/ -STATIC int +static void xfs_reclaim_inode( struct xfs_inode *ip, - struct xfs_perag *pag, - int sync_mode) + struct xfs_perag *pag) { - struct xfs_buf *bp = NULL; xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ - int error; -restart: - error = 0; - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (!xfs_iflock_nowait(ip)) { - if (!(sync_mode & SYNC_WAIT)) - goto out; - xfs_iflock(ip); - } + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + goto out; + if (!xfs_iflock_nowait(ip)) + goto out_iunlock; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); @@ -1136,52 +1044,12 @@ restart: xfs_iflush_abort(ip); goto reclaim; } - if (xfs_ipincount(ip)) { - if (!(sync_mode & SYNC_WAIT)) - goto out_ifunlock; - xfs_iunpin_wait(ip); - } - if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - goto reclaim; - } - - /* - * Never flush out dirty data during non-blocking reclaim, as it would - * just contend with AIL pushing trying to do the same job. - */ - if (!(sync_mode & SYNC_WAIT)) + if (xfs_ipincount(ip)) + goto out_ifunlock; + if (!xfs_inode_clean(ip)) goto out_ifunlock; - /* - * Now we have an inode that needs flushing. - * - * Note that xfs_iflush will never block on the inode buffer lock, as - * xfs_ifree_cluster() can lock the inode buffer before it locks the - * ip->i_lock, and we are doing the exact opposite here. As a result, - * doing a blocking xfs_imap_to_bp() to get the cluster buffer would - * result in an ABBA deadlock with xfs_ifree_cluster(). - * - * As xfs_ifree_cluser() must gather all inodes that are active in the - * cache to mark them stale, if we hit this case we don't actually want - * to do IO here - we want the inode marked stale so we can simply - * reclaim it. Hence if we get an EAGAIN error here, just unlock the - * inode, back off and try again. Hopefully the next pass through will - * see the stale flag set on the inode. - */ - error = xfs_iflush(ip, &bp); - if (error == -EAGAIN) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* backoff longer than in xfs_ifree_cluster */ - delay(2); - goto restart; - } - - if (!error) { - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - } - + xfs_ifunlock(ip); reclaim: ASSERT(!xfs_isiflocked(ip)); @@ -1228,23 +1096,17 @@ reclaim: xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); + ASSERT(xfs_inode_clean(ip)); __xfs_inode_free(ip); - return error; + return; out_ifunlock: xfs_ifunlock(ip); +out_iunlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); out: xfs_iflags_clear(ip, XFS_IRECLAIM); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* - * We could return -EAGAIN here to make reclaim rescan the inode tree in - * a short while. However, this just burns CPU time scanning the tree - * waiting for IO to complete and the reclaim work never goes back to - * the idle state. Instead, return 0 to let the next scheduled - * background reclaim attempt to reclaim the inode again. - */ - return 0; } /* @@ -1252,23 +1114,19 @@ out: * corrupted, we still want to try to reclaim all the inodes. If we don't, * then a shut down during filesystem unmount reclaim walk leak all the * unreclaimed inodes. + * + * The walk itself is non-blocking and returns nothing; callers that need to + * block until all dirty inodes are written back and reclaimed must loop on + * the XFS_ICI_RECLAIM_TAG radix tree tag, as xfs_reclaim_inodes() does.
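 *
 * One detail that enables the lockless walk below: the per-AG reclaim
 * cursor no longer needs pag_ici_reclaim_lock. It becomes a purely advisory
 * hint, read once when a walk starts and stored once when it ends. In
 * portable C the READ_ONCE()/WRITE_ONCE() pairing corresponds to relaxed
 * atomics (sketch with invented names):
 */

#include <stdatomic.h>

struct perag_sketch { _Atomic unsigned long reclaim_cursor; };

static void walk_one_ag(struct perag_sketch *pag)
{
	/* a racing walker may see a stale cursor; that only costs some
	 * rescanning, never correctness */
	unsigned long first_index =
		atomic_load_explicit(&pag->reclaim_cursor,
				     memory_order_relaxed);
	int done = 0;

	/* ... batched radix tree lookups from first_index, as below ... */

	if (done)
		first_index = 0;
	atomic_store_explicit(&pag->reclaim_cursor, first_index,
			      memory_order_relaxed);
}

/*
 * Losing a cursor update in a race is harmless for the same reason.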
*/ -STATIC int +static void xfs_reclaim_inodes_ag( struct xfs_mount *mp, - int flags, int *nr_to_scan) { struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; - int trylock = flags & SYNC_TRYLOCK; - int skipped; + xfs_agnumber_t ag = 0; -restart: - ag = 0; - skipped = 0; while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { unsigned long first_index = 0; int done = 0; @@ -1276,16 +1134,7 @@ restart: ag = pag->pag_agno + 1; - if (trylock) { - if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { - skipped++; - xfs_perag_put(pag); - continue; - } - first_index = pag->pag_ici_reclaim_cursor; - } else - mutex_lock(&pag->pag_ici_reclaim_lock); - + first_index = READ_ONCE(pag->pag_ici_reclaim_cursor); do { struct xfs_inode *batch[XFS_LOOKUP_BATCH]; int i; @@ -1309,7 +1158,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || xfs_reclaim_inode_grab(ip, flags)) + if (done || !xfs_reclaim_inode_grab(ip)) batch[i] = NULL; /* @@ -1338,59 +1187,39 @@ restart: rcu_read_unlock(); for (i = 0; i < nr_found; i++) { - if (!batch[i]) - continue; - error = xfs_reclaim_inode(batch[i], pag, flags); - if (error && last_error != -EFSCORRUPTED) - last_error = error; + if (batch[i]) + xfs_reclaim_inode(batch[i], pag); } *nr_to_scan -= XFS_LOOKUP_BATCH; cond_resched(); } while (nr_found && !done && *nr_to_scan > 0); - if (trylock && !done) - pag->pag_ici_reclaim_cursor = first_index; - else - pag->pag_ici_reclaim_cursor = 0; - mutex_unlock(&pag->pag_ici_reclaim_lock); + if (done) + first_index = 0; + WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index); xfs_perag_put(pag); } - - /* - * if we skipped any AG, and we still have scan count remaining, do - * another pass this time using blocking reclaim semantics (i.e - * waiting on the reclaim locks and ignoring the reclaim cursors). This - * ensure that when we get more reclaimers than AGs we block rather - * than spin trying to execute reclaim. - */ - if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { - trylock = 0; - goto restart; - } - return last_error; } -int +void xfs_reclaim_inodes( - xfs_mount_t *mp, - int mode) + struct xfs_mount *mp) { int nr_to_scan = INT_MAX; - return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); + while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + xfs_ail_push_all_sync(mp->m_ail); + xfs_reclaim_inodes_ag(mp, &nr_to_scan); + } } /* - * Scan a certain number of inodes for reclaim. - * - * When called we make sure that there is a background (fast) inode reclaim in - * progress, while we will throttle the speed of reclaim via doing synchronous - * reclaim of inodes. That means if we come across dirty inodes, we wait for - * them to be cleaned, which we hope will not be very long due to the - * background walker having already kicked the IO off on those dirty inodes. + * The shrinker infrastructure determines how many inodes we should scan for + * reclaim. We want as many clean inodes ready to reclaim as possible, so we + * push the AIL here. We also want to proactively free up memory if we can to + * minimise the amount of work memory reclaim has to do, so we kick the + * background reclaim if it isn't already scheduled.
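 *
 * For context, this entry point is reached from the VFS cache shrinker; in
 * XFS that wiring goes through super_operations, along these lines (quoted
 * from memory as an assumption, details may differ):
 *
 *	static long xfs_fs_free_cached_objects(struct super_block *sb,
 *					       struct shrink_control *sc)
 *	{
 *		return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
 *	}
 *
 * Returning 0 below therefore reports no direct progress to the shrinker;
 * the AIL push and the queued background worker do the actual work
 * asynchronously.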
*/ long xfs_reclaim_inodes_nr( @@ -1401,7 +1230,8 @@ xfs_reclaim_inodes_nr( xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); - return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); + xfs_reclaim_inodes_ag(mp, &nr_to_scan); + return 0; } /* @@ -1498,6 +1328,24 @@ xfs_inode_matches_eofb( return true; } +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. + */ +void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + int nr_to_scan = INT_MAX; + + xfs_reclaim_inodes_ag(mp, &nr_to_scan); + xfs_reclaim_work_queue(mp); +} + STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, @@ -1574,7 +1422,7 @@ __xfs_inode_free_quota_eofblocks( eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQ_USER); + dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER); if (dq && xfs_dquot_lowsp(dq)) { eofb.eof_uid = VFS_I(ip)->i_uid; eofb.eof_flags |= XFS_EOF_FLAGS_UID; @@ -1583,7 +1431,7 @@ __xfs_inode_free_quota_eofblocks( } if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQ_GROUP); + dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP); if (dq && xfs_dquot_lowsp(dq)) { eofb.eof_gid = VFS_I(ip)->i_gid; eofb.eof_flags |= XFS_EOF_FLAGS_GID; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 93b54e7d55f0..3a4c8b382cd0 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -17,9 +17,6 @@ struct xfs_eofblocks { __u64 eof_min_file_size; }; -#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ -#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ - /* * tags for inode radix tree */ @@ -51,7 +48,7 @@ void xfs_inode_free(struct xfs_inode *ip); void xfs_reclaim_worker(struct work_struct *work); -int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); +void xfs_reclaim_inodes(struct xfs_mount *mp); int xfs_reclaim_inodes_count(struct xfs_mount *mp); long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 287a9e5c7d75..9b3994b9c716 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -97,7 +97,7 @@ xfs_icreate_log( { struct xfs_icreate_item *icp; - icp = kmem_zone_zalloc(xfs_icreate_zone, 0); + icp = kmem_cache_zalloc(xfs_icreate_zone, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, &xfs_icreate_item_ops); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9aea7d68d8ab..407d6299606d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -44,7 +44,6 @@ kmem_zone_t *xfs_inode_zone; */ #define XFS_ITRUNC_MAX_EXTENTS 2 -STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *); STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *); @@ -1740,10 +1739,31 @@ xfs_inactive_ifree( return error; } + /* + * We do not hold the inode locked across the entire rolling transaction + * here. We only need to hold it for the first transaction that + * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the + * underlying cluster buffer is freed. 
Relogging an XFS_ISTALE inode + * here breaks the relationship between cluster buffer invalidation and + * stale inode invalidation on cluster buffer item journal commit + * completion, and can result in leaving dirty stale inodes hanging + * around in memory. + * + * We have no need for serialising this inode operation against other + * operations - we freed the inode and hence reallocation is required + * and that will serialise on reallocating the space the deferops need + * to free. Hence we can unlock the inode on the first commit of + * the transaction rather than roll it right through the deferops. This + * avoids relogging the XFS_ISTALE inode. + * + * We check that xfs_ifree() hasn't grown an internal transaction roll + * by asserting that the inode is still locked when it returns. + */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); error = xfs_ifree(tp, ip); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (error) { /* * If we fail to free the inode, shut down. The cancel @@ -1756,7 +1776,6 @@ xfs_inactive_ifree( xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); } xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1774,7 +1793,6 @@ xfs_inactive_ifree( xfs_notice(mp, "%s: xfs_trans_commit returned error %d", __func__, error); - xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; } @@ -2147,7 +2165,6 @@ xfs_iunlink_update_dinode( xfs_dinode_calc_crc(mp, dip); xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); - xfs_inobp_check(mp, ibp); } /* Set an in-core inode's unlinked pointer and return the old value. */ @@ -2248,7 +2265,6 @@ xfs_iunlink( } if (next_agino != NULLAGINO) { - struct xfs_perag *pag; xfs_agino_t old_agino; /* @@ -2265,9 +2281,7 @@ xfs_iunlink( * agino has been unlinked, add a backref from the next inode * back to agino. */ - pag = xfs_perag_get(mp, agno); - error = xfs_iunlink_add_backref(pag, agino, next_agino); - xfs_perag_put(pag); + error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino); if (error) return error; } @@ -2403,7 +2417,6 @@ xfs_iunlink_remove( struct xfs_buf *agibp; struct xfs_buf *last_ibp; struct xfs_dinode *last_dip = NULL; - struct xfs_perag *pag = NULL; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t next_agino; @@ -2447,32 +2460,22 @@ xfs_iunlink_remove( * this inode's backref to point from the next inode. */ if (next_agino != NULLAGINO) { - pag = xfs_perag_get(mp, agno); - error = xfs_iunlink_change_backref(pag, next_agino, + error = xfs_iunlink_change_backref(agibp->b_pag, next_agino, NULLAGINO); if (error) - goto out; + return error; } - if (head_agino == agino) { - /* Point the head of the list to the next unlinked inode. */ - error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, - next_agino); - if (error) - goto out; - } else { + if (head_agino != agino) { struct xfs_imap imap; xfs_agino_t prev_agino; - if (!pag) - pag = xfs_perag_get(mp, agno); - /* We need to search the list for the inode being freed. */ error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, &prev_agino, &imap, &last_dip, &last_ibp, - pag); + agibp->b_pag); if (error) - goto out; + return error; /* Point the previous inode on the list to the next inode. */ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, @@ -2486,29 +2489,29 @@ xfs_iunlink_remove( * change_backref takes care of deleting the backref if * next_agino is NULLAGINO. 
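 *
 * Stripped of the buffer mapping, logging and backref bookkeeping, the edit
 * is the classic singly linked list removal, keyed on whether the inode is
 * the bucket head. A toy model (invented representation: next[i] is inode
 * i's on-disk unlinked pointer):
 */

#include <stdint.h>

#define NULL_INO	((uint32_t)-1)	/* stand-in for NULLAGINO */

static void unlinked_remove(uint32_t *bucket_head, uint32_t *next,
			    uint32_t prev, uint32_t agino)
{
	if (*bucket_head == agino)
		*bucket_head = next[agino];	/* head-of-bucket case */
	else
		next[prev] = next[agino];	/* middle or tail case */
	next[agino] = NULL_INO;
}

/*
 * The two return paths below are the middle/tail and head cases of that
 * edit, in that order.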
*/ - error = xfs_iunlink_change_backref(pag, agino, next_agino); - if (error) - goto out; + return xfs_iunlink_change_backref(agibp->b_pag, agino, + next_agino); } -out: - if (pag) - xfs_perag_put(pag); - return error; + /* Point the head of the list to the next unlinked inode. */ + return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + next_agino); } /* - * Look up the inode number specified and mark it stale if it is found. If it is - * dirty, return the inode so it can be attached to the cluster buffer so it can - * be processed appropriately when the cluster free transaction completes. + * Look up the inode number specified and if it is not already marked XFS_ISTALE + * mark it stale. We should only find clean inodes in this lookup that aren't + * already stale. */ -static struct xfs_inode * -xfs_ifree_get_one_inode( - struct xfs_perag *pag, +static void +xfs_ifree_mark_inode_stale( + struct xfs_buf *bp, struct xfs_inode *free_ip, xfs_ino_t inum) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = bp->b_mount; + struct xfs_perag *pag = bp->b_pag; + struct xfs_inode_log_item *iip; struct xfs_inode *ip; retry: @@ -2516,8 +2519,10 @@ retry: ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); /* Inode not in memory, nothing to do */ - if (!ip) - goto out_rcu_unlock; + if (!ip) { + rcu_read_unlock(); + return; + } /* * because this is an RCU protected lookup, we could find a recently @@ -2528,9 +2533,9 @@ retry: spin_lock(&ip->i_flags_lock); if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { spin_unlock(&ip->i_flags_lock); - goto out_rcu_unlock; + rcu_read_unlock(); + return; } - spin_unlock(&ip->i_flags_lock); /* * Don't try to lock/unlock the current inode, but we _cannot_ skip the @@ -2540,43 +2545,50 @@ retry: */ if (ip != free_ip) { if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); delay(1); goto retry; } - - /* - * Check the inode number again in case we're racing with - * freeing in xfs_reclaim_inode(). See the comments in that - * function for more information as to why the initial check is - * not sufficient. - */ - if (ip->i_ino != inum) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out_rcu_unlock; - } } + ip->i_flags |= XFS_ISTALE; + spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); - xfs_iflock(ip); - xfs_iflags_set(ip, XFS_ISTALE); + /* + * If we can't get the flush lock, the inode is already attached. All + * we needed to do here is mark the inode stale so buffer IO completion + * will remove it from the AIL. + */ + iip = ip->i_itemp; + if (!xfs_iflock_nowait(ip)) { + ASSERT(!list_empty(&iip->ili_item.li_bio_list)); + ASSERT(iip->ili_last_fields); + goto out_iunlock; + } /* - * We don't need to attach clean inodes or those only with unlogged - * changes (which we throw away, anyway). + * Inodes not attached to the buffer can be released immediately. + * Everything else has to go through xfs_iflush_abort() on journal + * commit as the flock synchronises removal of the inode from the + * cluster buffer against inode reclaim. */ - if (!ip->i_itemp || xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); + if (!iip || list_empty(&iip->ili_item.li_bio_list)) { xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out_no_inode; + goto out_iunlock; } - return ip; -out_rcu_unlock: - rcu_read_unlock(); -out_no_inode: - return NULL; + /* we have a dirty inode in memory that has not yet been flushed. 
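To make the lookup loop above concrete: xfs_ifree_mark_inode_stale() uses the "validate under a spinlock, trylock the heavier lock, back off completely on failure" discipline, where a failed xfs_ilock_nowait() drops every lock held, sleeps briefly, and restarts the RCU lookup from scratch. A userspace model of that retry shape (demo_* names hypothetical, pthreads standing in for the kernel locks):

	#include <pthread.h>
	#include <stdbool.h>
	#include <unistd.h>

	struct demo_obj {
		pthread_mutex_t flags_lock;	/* like i_flags_lock */
		pthread_mutex_t ilock;		/* like the ILOCK */
		bool stale;
	};

	static void demo_mark_stale(struct demo_obj *o)
	{
	retry:
		pthread_mutex_lock(&o->flags_lock);
		if (o->stale) {			/* already marked: done */
			pthread_mutex_unlock(&o->flags_lock);
			return;
		}
		if (pthread_mutex_trylock(&o->ilock) != 0) {
			/* back off fully and retry, like delay(1) + goto */
			pthread_mutex_unlock(&o->flags_lock);
			usleep(1000);
			goto retry;
		}
		o->stale = true;	/* set while both locks are held */
		pthread_mutex_unlock(&o->flags_lock);
		/* ... the heavier processing happens under ilock ... */
		pthread_mutex_unlock(&o->ilock);
	}

	int main(void)
	{
		struct demo_obj o = { PTHREAD_MUTEX_INITIALIZER,
				      PTHREAD_MUTEX_INITIALIZER, false };

		demo_mark_stale(&o);
		return o.stale ? 0 : 1;
	}

Backing off completely rather than spinning while holding the spinlock is what keeps this pattern deadlock-free.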
*/ + spin_lock(&iip->ili_lock); + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); + ASSERT(iip->ili_last_fields); + +out_iunlock: + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); } /* @@ -2586,26 +2598,20 @@ out_no_inode: */ STATIC int xfs_ifree_cluster( - xfs_inode_t *free_ip, - xfs_trans_t *tp, + struct xfs_inode *free_ip, + struct xfs_trans *tp, struct xfs_icluster *xic) { - xfs_mount_t *mp = free_ip->i_mount; + struct xfs_mount *mp = free_ip->i_mount; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + struct xfs_buf *bp; + xfs_daddr_t blkno; + xfs_ino_t inum = xic->first_ino; int nbufs; int i, j; int ioffset; - xfs_daddr_t blkno; - xfs_buf_t *bp; - xfs_inode_t *ip; - struct xfs_inode_log_item *iip; - struct xfs_log_item *lip; - struct xfs_perag *pag; - struct xfs_ino_geometry *igeo = M_IGEO(mp); - xfs_ino_t inum; int error; - inum = xic->first_ino; - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { @@ -2634,10 +2640,8 @@ xfs_ifree_cluster( error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, mp->m_bsize * igeo->blocks_per_cluster, XBF_UNMAPPED, &bp); - if (error) { - xfs_perag_put(pag); + if (error) return error; - } /* * This buffer may not have been correctly initialised as we @@ -2651,60 +2655,16 @@ xfs_ifree_cluster( bp->b_ops = &xfs_inode_buf_ops; /* - * Walk the inodes already attached to the buffer and mark them - * stale. These will all have the flush locks held, so an - * in-memory inode walk can't lock them. By marking them all - * stale first, we will not attempt to lock them in the loop - * below as the XFS_ISTALE flag will be set. + * Now we need to set all the cached clean inodes as XFS_ISTALE, + * too. This requires lookups, and will skip inodes that we've + * already marked XFS_ISTALE. */ - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - if (lip->li_type == XFS_LI_INODE) { - iip = (struct xfs_inode_log_item *)lip; - ASSERT(iip->ili_logged == 1); - lip->li_cb = xfs_istale_done; - xfs_trans_ail_copy_lsn(mp->m_ail, - &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); - xfs_iflags_set(iip->ili_inode, XFS_ISTALE); - } - } - - - /* - * For each inode in memory attempt to add it to the inode - * buffer and set it up for being staled on buffer IO - * completion. This is safe as we've locked out tail pushing - * and flushing by locking the buffer. - * - * We have already marked every inode that was part of a - * transaction stale above, which means there is no point in - * even trying to lock them. 
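A note on the cluster walk above: xfs_ifree_cluster() is driven entirely by inode geometry, with an allocation chunk of ialloc_blks blocks split into nbufs cluster buffers and the inode number advanced by inodes_per_cluster per buffer. A runnable arithmetic sketch; the numbers are hypothetical and chosen only to be self-consistent:

	#include <stdio.h>

	int main(void)
	{
		unsigned int ialloc_blks = 16;		/* blocks per chunk */
		unsigned int blocks_per_cluster = 4;	/* blocks per buffer */
		unsigned int inodes_per_cluster = 32;	/* inodes per buffer */
		unsigned int inum = 1024;		/* like xic->first_ino */

		unsigned int nbufs = ialloc_blks / blocks_per_cluster;

		for (unsigned int j = 0; j < nbufs;
		     j++, inum += inodes_per_cluster)
			printf("buffer %u covers inodes %u..%u\n",
			       j, inum, inum + inodes_per_cluster - 1);
		return 0;
	}

With these values the chunk maps to four cluster buffers covering inodes 1024..1151.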
- */ - for (i = 0; i < igeo->inodes_per_cluster; i++) { - ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i); - if (!ip) - continue; - - iip = ip->i_itemp; - iip->ili_last_fields = iip->ili_fields; - iip->ili_fields = 0; - iip->ili_fsync_fields = 0; - iip->ili_logged = 1; - xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); - - xfs_buf_attach_iodone(bp, xfs_istale_done, - &iip->ili_item); - - if (ip != free_ip) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + for (i = 0; i < igeo->inodes_per_cluster; i++) + xfs_ifree_mark_inode_stale(bp, free_ip, inum + i); xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); } - - xfs_perag_put(pag); return 0; } @@ -2725,6 +2685,7 @@ xfs_ifree( { int error; struct xfs_icluster xic = { 0 }; + struct xfs_inode_log_item *iip = ip->i_itemp; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(VFS_I(ip)->i_nlink == 0); @@ -2762,7 +2723,9 @@ xfs_ifree( ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; /* Don't attempt to replay owner changes for a deleted inode */ - ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER); + spin_lock(&iip->ili_lock); + iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); + spin_unlock(&iip->ili_lock); /* * Bump the generation count so no one will be confused @@ -3469,232 +3432,9 @@ out_release_wip: return error; } -STATIC int -xfs_iflush_cluster( - struct xfs_inode *ip, - struct xfs_buf *bp) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - unsigned long first_index, mask; - int cilist_size; - struct xfs_inode **cilist; - struct xfs_inode *cip; - struct xfs_ino_geometry *igeo = M_IGEO(mp); - int error = 0; - int nr_found; - int clcount = 0; - int i; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - - cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *); - cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); - if (!cilist) - goto out_put; - - mask = ~(igeo->inodes_per_cluster - 1); - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; - rcu_read_lock(); - /* really need a gang lookup range call here */ - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, - first_index, igeo->inodes_per_cluster); - if (nr_found == 0) - goto out_free; - - for (i = 0; i < nr_found; i++) { - cip = cilist[i]; - if (cip == ip) - continue; - - /* - * because this is an RCU protected lookup, we could find a - * recently freed or even reallocated inode during the lookup. - * We need to check under the i_flags_lock for a valid inode - * here. Skip it if it is not valid or the wrong inode. - */ - spin_lock(&cip->i_flags_lock); - if (!cip->i_ino || - __xfs_iflags_test(cip, XFS_ISTALE)) { - spin_unlock(&cip->i_flags_lock); - continue; - } - - /* - * Once we fall off the end of the cluster, no point checking - * any more inodes in the list because they will also all be - * outside the cluster. - */ - if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { - spin_unlock(&cip->i_flags_lock); - break; - } - spin_unlock(&cip->i_flags_lock); - - /* - * Do an un-protected check to see if the inode is dirty and - * is a candidate for flushing. These checks will be repeated - * later after the appropriate locks are acquired. - */ - if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) - continue; - - /* - * Try to get locks. If any are unavailable or it is pinned, - * then this inode cannot be flushed and is skipped. 
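A note on the removed lookup above: the old xfs_iflush_cluster() found cluster-mates by rounding the inode number down to its cluster boundary with a power-of-two mask and rejecting gang-lookup results whose masked value differs. The masking trick in isolation, with hypothetical numbers:

	#include <stdio.h>

	int main(void)
	{
		unsigned int inodes_per_cluster = 32;	/* power of two */
		unsigned int mask = ~(inodes_per_cluster - 1);
		unsigned int agino = 1234;
		unsigned int first_index = agino & mask;	/* 1216 */

		for (unsigned int c = first_index; c < first_index + 40; c += 8)
			printf("agino %u %s the cluster\n", c,
			       (c & mask) == first_index ?
			       "is inside" : "falls off");
		return 0;
	}

Once a candidate's masked value changes, every later gang-lookup result is outside the cluster too, which is why the old loop could break instead of continue.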
- */ - - if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) - continue; - if (!xfs_iflock_nowait(cip)) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - if (xfs_ipincount(cip)) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - - - /* - * Check the inode number again, just to be certain we are not - * racing with freeing in xfs_reclaim_inode(). See the comments - * in that function for more information as to why the initial - * check is not sufficient. - */ - if (!cip->i_ino) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - - /* - * arriving here means that this inode can be flushed. First - * re-check that it's dirty before flushing. - */ - if (!xfs_inode_clean(cip)) { - error = xfs_iflush_int(cip, bp); - if (error) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - goto out_free; - } - clcount++; - } else { - xfs_ifunlock(cip); - } - xfs_iunlock(cip, XFS_ILOCK_SHARED); - } - - if (clcount) { - XFS_STATS_INC(mp, xs_icluster_flushcnt); - XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); - } - -out_free: - rcu_read_unlock(); - kmem_free(cilist); -out_put: - xfs_perag_put(pag); - return error; -} - -/* - * Flush dirty inode metadata into the backing buffer. - * - * The caller must have the inode lock and the inode flush lock held. The - * inode lock will still be held upon return to the caller, and the inode - * flush lock will be released after the inode has reached the disk. - * - * The caller must write out the buffer returned in *bpp and release it. - */ -int +static int xfs_iflush( struct xfs_inode *ip, - struct xfs_buf **bpp) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_buf *bp = NULL; - struct xfs_dinode *dip; - int error; - - XFS_STATS_INC(mp, xs_iflush_count); - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || - ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - - *bpp = NULL; - - xfs_iunpin_wait(ip); - - /* - * For stale inodes we cannot rely on the backing buffer remaining - * stale in cache for the remaining life of the stale inode and so - * xfs_imap_to_bp() below may give us a buffer that no longer contains - * inodes below. We have to check this after ensuring the inode is - * unpinned so that it is safe to reclaim the stale inode after the - * flush call. - */ - if (xfs_iflags_test(ip, XFS_ISTALE)) { - xfs_ifunlock(ip); - return 0; - } - - /* - * Get the buffer containing the on-disk inode. We are doing a try-lock - * operation here, so we may get an EAGAIN error. In that case, return - * leaving the inode dirty. - * - * If we get any other error, we effectively have a corruption situation - * and we cannot flush the inode. Abort the flush and shut down. - */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK); - if (error == -EAGAIN) { - xfs_ifunlock(ip); - return error; - } - if (error) - goto abort; - - /* - * If the buffer is pinned then push on the log now so we won't - * get stuck waiting in the write for too long. - */ - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - - /* - * Flush the provided inode then attempt to gather others from the - * cluster into the write. - * - * Note: Once we attempt to flush an inode, we must run buffer - * completion callbacks on any failure. If this fails, simulate an I/O - * failure on the buffer and shut down. 
- */ - error = xfs_iflush_int(ip, bp); - if (!error) - error = xfs_iflush_cluster(ip, bp); - if (error) { - bp->b_flags |= XBF_ASYNC; - xfs_buf_ioend_fail(bp); - goto shutdown; - } - - *bpp = bp; - return 0; - -abort: - xfs_iflush_abort(ip); -shutdown: - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return error; -} - -STATIC int -xfs_iflush_int( - struct xfs_inode *ip, struct xfs_buf *bp) { struct xfs_inode_log_item *iip = ip->i_itemp; @@ -3706,7 +3446,7 @@ xfs_iflush_int( ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - ASSERT(iip != NULL && iip->ili_fields != 0); + ASSERT(iip->ili_item.li_buf == bp); dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -3801,7 +3541,6 @@ xfs_iflush_int( xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); if (XFS_IFORK_Q(ip)) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); - xfs_inobp_check(mp, bp); /* * We've recorded everything logged in the inode, so we'd like to clear @@ -3818,40 +3557,148 @@ xfs_iflush_int( * know that the information those bits represent is permanently on * disk. As long as the flush completes before the inode is logged * again, then both ili_fields and ili_last_fields will be cleared. - * - * We can play with the ili_fields bits here, because the inode lock - * must be held exclusively in order to set bits there and the flush - * lock protects the ili_last_fields bits. Set ili_logged so the flush - * done routine can tell whether or not to look in the AIL. Also, store - * the current LSN of the inode so that we can tell whether the item has - * moved in the AIL from xfs_iflush_done(). In order to read the lsn we - * need the AIL lock, because it is a 64 bit value that cannot be read - * atomically. */ error = 0; flush_out: + spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; - iip->ili_logged = 1; + spin_unlock(&iip->ili_lock); + /* + * Store the current LSN of the inode so that we can tell whether the + * item has moved in the AIL from xfs_iflush_done(). + */ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); + /* generate the checksum. */ + xfs_dinode_calc_crc(mp, dip); + return error; +} + +/* + * Non-blocking flush of dirty inode metadata into the backing buffer. + * + * The caller must have a reference to the inode and hold the cluster buffer + * locked. The function will walk across all the inodes on the cluster buffer it + * can find and lock without blocking, and flush them to the cluster buffer. + * + * On successful flushing of at least one inode, the caller must write out the + * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and + * the caller needs to release the buffer. On failure, the filesystem will be + * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED + * will be returned. + */ +int +xfs_iflush_cluster( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_log_item *lip, *n; + struct xfs_inode *ip; + struct xfs_inode_log_item *iip; + int clcount = 0; + int error = 0; + /* - * Attach the inode item callback to the buffer whether the flush - * succeeded or not. If not, the caller will shut down and fail I/O - * completion on the buffer to remove the inode from the AIL and release - * the flush lock. + * We must use the safe variant here as on shutdown xfs_iflush_abort() + * can remove itself from the list. 
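The _safe iterator referred to above matters precisely because xfs_iflush_abort() can unlink the entry being visited, so the next pointer must be cached before the loop body runs. A userspace model of the same hazard and its fix:

	#include <stddef.h>
	#include <stdio.h>

	struct node {
		int id;
		struct node *next;
	};

	int main(void)
	{
		struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
		struct node *head = &a;

		/* safe walk: cache 'n' before the body may unlink 'cur' */
		for (struct node *cur = head, *n; cur; cur = n) {
			n = cur->next;
			if (cur->id == 2) {
				a.next = cur->next;	/* unlink cur */
				cur->next = NULL;
				continue;	/* 'n' still carries us on */
			}
			printf("visited %d\n", cur->id);
		}
		return 0;
	}

An unsafe walk (advancing via cur->next after the body) would follow the unlinked node's cleared pointer and terminate early.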
*/ - xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { + iip = (struct xfs_inode_log_item *)lip; + ip = iip->ili_inode; - /* generate the checksum. */ - xfs_dinode_calc_crc(mp, dip); + /* + * Quick and dirty check to avoid locks if possible. + */ + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) + continue; + if (xfs_ipincount(ip)) + continue; + + /* + * The inode is still attached to the buffer, which means it is + * dirty but reclaim might try to grab it. Check carefully for + * that, and grab the ilock while still holding the i_flags_lock + * to guarantee reclaim will not be able to reclaim this inode + * once we drop the i_flags_lock. + */ + spin_lock(&ip->i_flags_lock); + ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) { + spin_unlock(&ip->i_flags_lock); + continue; + } + + /* + * ILOCK will pin the inode against reclaim and prevent + * concurrent transactions modifying the inode while we are + * flushing the inode. + */ + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + spin_unlock(&ip->i_flags_lock); + continue; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Skip inodes that are already flush locked as they have + * already been written to the buffer. + */ + if (!xfs_iflock_nowait(ip)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + continue; + } + + /* + * Abort flushing this inode if we are shut down because the + * inode may not currently be in the AIL. This can occur when + * log I/O failure unpins the inode without inserting into the + * AIL, leaving a dirty/unpinned inode attached to the buffer + * that otherwise looks like it should be flushed. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + xfs_iunpin_wait(ip); + /* xfs_iflush_abort() drops the flush lock */ + xfs_iflush_abort(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + error = -EIO; + continue; + } + + /* don't block waiting on a log force to unpin dirty inodes */ + if (xfs_ipincount(ip)) { + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + continue; + } + + if (!xfs_inode_clean(ip)) + error = xfs_iflush(ip, bp); + else + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (error) + break; + clcount++; + } + + if (error) { + bp->b_flags |= XBF_ASYNC; + xfs_buf_ioend_fail(bp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + + if (!clcount) + return -EAGAIN; + + XFS_STATS_INC(mp, xs_icluster_flushcnt); + XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); + return 0; - ASSERT(!list_empty(&bp->b_li_list)); - ASSERT(bp->b_iodone != NULL); - return error; } /* Release an inode. */ @@ -3881,3 +3728,96 @@ xfs_log_force_inode( return 0; return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); } + +/* + * Grab the exclusive iolock for a data copy from src to dest, making sure to + * abide vfs locking order (lowest pointer value goes first) and breaking the + * layout leases before proceeding. The loop is needed because we cannot call + * the blocking break_layout() with the iolocks held, and therefore have to + * back out both locks. + */ +static int +xfs_iolock_two_inodes_and_break_layout( + struct inode *src, + struct inode *dest) +{ + int error; + + if (src > dest) + swap(src, dest); + +retry: + /* Wait to break both inodes' layouts before we start locking. 
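About the swap() above: taking two inode locks in address order is the standard ABBA-deadlock avoidance rule, and the src == dest case must be tolerated for same-file operations. A userspace model of the ordering rule (demo_* names hypothetical):

	#include <pthread.h>

	struct demo_inode {
		pthread_mutex_t lock;
	};

	/* always lock the lower address first; tolerate x == y */
	static void demo_lock_two(struct demo_inode *x, struct demo_inode *y)
	{
		if (x > y) {
			struct demo_inode *t = x;
			x = y;
			y = t;
		}
		pthread_mutex_lock(&x->lock);
		if (x != y)
			pthread_mutex_lock(&y->lock);
	}

	int main(void)
	{
		struct demo_inode a = { PTHREAD_MUTEX_INITIALIZER };
		struct demo_inode b = { PTHREAD_MUTEX_INITIALIZER };

		demo_lock_two(&a, &b);	/* same order as demo_lock_two(&b, &a) */
		return 0;
	}

Because every caller converges on one global order, no pair of threads can each hold one lock while waiting on the other.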
*/ + error = break_layout(src, true); + if (error) + return error; + if (src != dest) { + error = break_layout(dest, true); + if (error) + return error; + } + + /* Lock one inode and make sure nobody got in and leased it. */ + inode_lock(src); + error = break_layout(src, false); + if (error) { + inode_unlock(src); + if (error == -EWOULDBLOCK) + goto retry; + return error; + } + + if (src == dest) + return 0; + + /* Lock the other inode and make sure nobody got in and leased it. */ + inode_lock_nested(dest, I_MUTEX_NONDIR2); + error = break_layout(dest, false); + if (error) { + inode_unlock(src); + inode_unlock(dest); + if (error == -EWOULDBLOCK) + goto retry; + return error; + } + + return 0; +} + +/* + * Lock two inodes so that userspace cannot initiate I/O via file syscalls or + * mmap activity. + */ +int +xfs_ilock2_io_mmap( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + int ret; + + ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); + if (ret) + return ret; + if (ip1 == ip2) + xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); + else + xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, + ip2, XFS_MMAPLOCK_EXCL); + return 0; +} + +/* Unlock both inodes to allow IO and mmap activity. */ +void +xfs_iunlock2_io_mmap( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + bool same_inode = (ip1 == ip2); + + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + if (!same_inode) + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + inode_unlock(VFS_I(ip2)); + if (!same_inode) + inode_unlock(VFS_I(ip1)); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 47d3b391030d..e9a8bb184d1f 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -426,7 +426,7 @@ int xfs_log_force_inode(struct xfs_inode *ip); void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) -int xfs_iflush(struct xfs_inode *, struct xfs_buf **); +int xfs_iflush_cluster(struct xfs_buf *); void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode); @@ -499,4 +499,7 @@ void xfs_iunlink_destroy(struct xfs_perag *pag); void xfs_end_io(struct work_struct *work); +int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); +void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index ba47bf65b772..895f61b2b4f0 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -439,6 +439,7 @@ xfs_inode_item_pin( struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(lip->li_buf); trace_xfs_inode_pin(ip, _RET_IP_); atomic_inc(&ip->i_pincount); @@ -450,6 +451,12 @@ xfs_inode_item_pin( * item which was previously pinned with a call to xfs_inode_item_pin(). * * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. + * + * Note that unpin can race with inode cluster buffer freeing marking the buffer + * stale. In that case, flush completions are run from the buffer unpin call, + * which may happen before the inode is unpinned. If we lose the race, there + * will be no buffer attached to the log item, but the inode will be marked + * XFS_ISTALE. 
*/ STATIC void xfs_inode_item_unpin( @@ -459,28 +466,12 @@ xfs_inode_item_unpin( struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; trace_xfs_inode_unpin(ip, _RET_IP_); + ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE)); ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } -/* - * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer - * have been failed during writeback - * - * This informs the AIL that the inode is already flush locked on the next push, - * and acquires a hold on the buffer to ensure that it isn't reclaimed before - * dirty data makes it to disk. - */ -STATIC void -xfs_inode_item_error( - struct xfs_log_item *lip, - struct xfs_buf *bp) -{ - ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode)); - xfs_set_li_failed(lip, bp); -} - STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, @@ -494,55 +485,44 @@ xfs_inode_item_push( uint rval = XFS_ITEM_SUCCESS; int error; - if (xfs_ipincount(ip) > 0) + ASSERT(iip->ili_item.li_buf); + + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || + (ip->i_flags & XFS_ISTALE)) return XFS_ITEM_PINNED; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) - return XFS_ITEM_LOCKED; + /* If the inode is already flush locked, we're already flushing. */ + if (xfs_isiflocked(ip)) + return XFS_ITEM_FLUSHING; - /* - * Re-check the pincount now that we stabilized the value by - * taking the ilock. - */ - if (xfs_ipincount(ip) > 0) { - rval = XFS_ITEM_PINNED; - goto out_unlock; - } + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; - /* - * Stale inode items should force out the iclog. - */ - if (ip->i_flags & XFS_ISTALE) { - rval = XFS_ITEM_PINNED; - goto out_unlock; - } + spin_unlock(&lip->li_ailp->ail_lock); /* - * Someone else is already flushing the inode. Nothing we can do - * here but wait for the flush to finish and remove the item from - * the AIL. + * We need to hold a reference for flushing the cluster buffer as it may + * fail the buffer without IO submission. In which case, we better get a + * reference for that completion because otherwise we don't get a + * reference for IO until we queue the buffer for delwri submission. */ - if (!xfs_iflock_nowait(ip)) { - rval = XFS_ITEM_FLUSHING; - goto out_unlock; - } - - ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); - ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); - - spin_unlock(&lip->li_ailp->ail_lock); - - error = xfs_iflush(ip, &bp); + xfs_buf_hold(bp); + error = xfs_iflush_cluster(bp); if (!error) { if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } else if (error == -EAGAIN) + } else { + /* + * Release the buffer if we were unable to flush anything. On + * any other error, the buffer has already been released. 
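The comment above encodes a buffer-ownership convention in xfs_iflush_cluster()'s return value: -EAGAIN means "nothing flushed, the reference is still yours", while success or a fatal error means the buffer has already been handed off or released. A small runnable model of that convention (demo_* names hypothetical):

	#include <errno.h>
	#include <stdio.h>

	struct demo_buf { int refs; };

	static void demo_buf_release(struct demo_buf *bp) { bp->refs--; }

	/* stand-in for the flush: pretend nothing could be flushed */
	static int demo_flush(struct demo_buf *bp) { (void)bp; return -EAGAIN; }

	static int demo_push(struct demo_buf *bp)
	{
		int error = demo_flush(bp);

		if (error == -EAGAIN)
			demo_buf_release(bp);	/* still our reference */
		/* on 0 or fatal errors the callee disposed of it already */
		return error;
	}

	int main(void)
	{
		struct demo_buf bp = { .refs = 1 };

		demo_push(&bp);
		printf("refs after push: %d\n", bp.refs);	/* 0 */
		return 0;
	}

Making exactly one party responsible for the reference on each return value is what keeps the hold/rele pairing balanced in the real code.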
+ */ + if (error == -EAGAIN) + xfs_buf_relse(bp); rval = XFS_ITEM_LOCKED; + } spin_lock(&lip->li_ailp->ail_lock); -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_SHARED); return rval; } @@ -621,7 +601,6 @@ static const struct xfs_item_ops xfs_inode_item_ops = { .iop_committed = xfs_inode_item_committed, .iop_push = xfs_inode_item_push, .iop_committing = xfs_inode_item_committing, - .iop_error = xfs_inode_item_error }; @@ -636,9 +615,11 @@ xfs_inode_item_init( struct xfs_inode_log_item *iip; ASSERT(ip->i_itemp == NULL); - iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0); + iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_zone, + GFP_KERNEL | __GFP_NOFAIL); iip->ili_inode = ip; + spin_lock_init(&iip->ili_lock); xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, &xfs_inode_item_ops); } @@ -648,110 +629,129 @@ xfs_inode_item_init( */ void xfs_inode_item_destroy( - xfs_inode_t *ip) + struct xfs_inode *ip) { - kmem_free(ip->i_itemp->ili_item.li_lv_shadow); - kmem_cache_free(xfs_ili_zone, ip->i_itemp); + struct xfs_inode_log_item *iip = ip->i_itemp; + + ASSERT(iip->ili_item.li_buf == NULL); + + ip->i_itemp = NULL; + kmem_free(iip->ili_item.li_lv_shadow); + kmem_cache_free(xfs_ili_zone, iip); } /* - * This is the inode flushing I/O completion routine. It is called - * from interrupt level when the buffer containing the inode is - * flushed to disk. It is responsible for removing the inode item - * from the AIL if it has not been re-logged, and unlocking the inode's - * flush lock. - * - * To reduce AIL lock traffic as much as possible, we scan the buffer log item - * list for other inodes that will run this function. We remove them from the - * buffer list so we can process all the inode IO completions in one AIL lock - * traversal. + * We only want to pull the item from the AIL if it is actually there + * and its location in the log has not changed since we started the + * flush. Thus, we only bother if the inode's lsn has not changed. */ -void -xfs_iflush_done( - struct xfs_buf *bp, - struct xfs_log_item *lip) +static void +xfs_iflush_ail_updates( + struct xfs_ail *ailp, + struct list_head *list) { - struct xfs_inode_log_item *iip; - struct xfs_log_item *blip, *n; - struct xfs_ail *ailp = lip->li_ailp; - int need_ail = 0; - LIST_HEAD(tmp); + struct xfs_log_item *lip; + xfs_lsn_t tail_lsn = 0; - /* - * Scan the buffer IO completions for other inodes being completed and - * attach them to the current inode log item. - */ + /* this is an opencoded batch version of xfs_trans_ail_delete */ + spin_lock(&ailp->ail_lock); + list_for_each_entry(lip, list, li_bio_list) { + xfs_lsn_t lsn; - list_add_tail(&lip->li_bio_list, &tmp); - - list_for_each_entry_safe(blip, n, &bp->b_li_list, li_bio_list) { - if (lip->li_cb != xfs_iflush_done) + clear_bit(XFS_LI_FAILED, &lip->li_flags); + if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn) continue; - list_move_tail(&blip->li_bio_list, &tmp); + lsn = xfs_ail_delete_one(ailp, lip); + if (!tail_lsn && lsn) + tail_lsn = lsn; + } + xfs_ail_update_finish(ailp, tail_lsn); +} + +/* + * Walk the list of inodes that have completed their IOs. If they are clean + * remove them from the list and dissociate them from the buffer. Buffers that + * are still dirty remain linked to the buffer and on the list. Caller must + * handle them appropriately. 
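In the batch-delete loop above, xfs_iflush_ail_updates() only latches the first LSN a removal reports because later removals sit at the same or a higher LSN, so the first hit is the new tail. The latch idiom in isolation, with made-up values:

	#include <stdio.h>

	int main(void)
	{
		long long lsns[] = { 0, 100, 120, 150 };  /* 0: not in AIL */
		long long tail_lsn = 0;

		for (int i = 0; i < 4; i++) {
			long long lsn = lsns[i];

			if (!tail_lsn && lsn)	/* keep the first nonzero */
				tail_lsn = lsn;
		}
		printf("tail moves to %lld\n", tail_lsn);	/* 100 */
		return 0;
	}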
+ */ +static void +xfs_iflush_finish( + struct xfs_buf *bp, + struct list_head *list) +{ + struct xfs_log_item *lip, *n; + + list_for_each_entry_safe(lip, n, list, li_bio_list) { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + bool drop_buffer = false; + + spin_lock(&iip->ili_lock); + /* - * while we have the item, do the unlocked check for needing - * the AIL lock. + * Remove the reference to the cluster buffer if the inode is + * clean in memory and drop the buffer reference once we've + * dropped the locks we hold. */ - iip = INODE_ITEM(blip); - if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || - test_bit(XFS_LI_FAILED, &blip->li_flags)) - need_ail++; + ASSERT(iip->ili_item.li_buf == bp); + if (!iip->ili_fields) { + iip->ili_item.li_buf = NULL; + list_del_init(&lip->li_bio_list); + drop_buffer = true; + } + iip->ili_last_fields = 0; + iip->ili_flush_lsn = 0; + spin_unlock(&iip->ili_lock); + xfs_ifunlock(iip->ili_inode); + if (drop_buffer) + xfs_buf_rele(bp); } +} - /* make sure we capture the state of the initial inode. */ - iip = INODE_ITEM(lip); - if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || - test_bit(XFS_LI_FAILED, &lip->li_flags)) - need_ail++; +/* + * Inode buffer IO completion routine. It is responsible for removing inodes + * attached to the buffer from the AIL if they have not been re-logged, as well + * as completing the flush and unlocking the inode. + */ +void +xfs_iflush_done( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip, *n; + LIST_HEAD(flushed_inodes); + LIST_HEAD(ail_updates); /* - * We only want to pull the item from the AIL if it is - * actually there and its location in the log has not - * changed since we started the flush. Thus, we only bother - * if the ili_logged flag is set and the inode's lsn has not - * changed. First we check the lsn outside - * the lock since it's cheaper, and then we recheck while - * holding the lock before removing the inode from the AIL. + * Pull the attached inodes from the buffer one at a time and take the + * appropriate action on them. */ - if (need_ail) { - xfs_lsn_t tail_lsn = 0; - - /* this is an opencoded batch version of xfs_trans_ail_delete */ - spin_lock(&ailp->ail_lock); - list_for_each_entry(blip, &tmp, li_bio_list) { - if (INODE_ITEM(blip)->ili_logged && - blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) { - /* - * xfs_ail_update_finish() only cares about the - * lsn of the first tail item removed, any - * others will be at the same or higher lsn so - * we just ignore them. - */ - xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip); - if (!tail_lsn && lsn) - tail_lsn = lsn; - } else { - xfs_clear_li_failed(blip); - } + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + + if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) { + xfs_iflush_abort(iip->ili_inode); + continue; } - xfs_ail_update_finish(ailp, tail_lsn); + if (!iip->ili_last_fields) + continue; + + /* Do an unlocked check for needing the AIL lock. */ + if (iip->ili_flush_lsn == lip->li_lsn || + test_bit(XFS_LI_FAILED, &lip->li_flags)) + list_move_tail(&lip->li_bio_list, &ail_updates); + else + list_move_tail(&lip->li_bio_list, &flushed_inodes); } - /* - * clean up and unlock the flush lock now we are done. We can clear the - * ili_last_fields bits now that we know that the data corresponding to - * them is safely on disk. 
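The walk above gives xfs_iflush_done() a partition-then-process design: one pass sorts each attached item onto an "AIL update needed" or a "just complete the flush" list, so the AIL lock is taken at most once and only when needed. A compact userspace model of the partition step:

	#include <stddef.h>
	#include <stdio.h>

	struct item {
		int needs_ail;
		struct item *next;
	};

	int main(void)
	{
		struct item c = { 1, NULL }, b = { 0, &c }, a = { 1, &b };
		struct item *src = &a, *ail = NULL, *done = NULL;

		while (src) {	/* each item moved exactly once */
			struct item *cur = src;

			src = src->next;
			if (cur->needs_ail) {
				cur->next = ail;
				ail = cur;
			} else {
				cur->next = done;
				done = cur;
			}
		}
		printf("take the AIL lock: %s\n", ail ? "yes" : "no");
		return 0;
	}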
- */ - list_for_each_entry_safe(blip, n, &tmp, li_bio_list) { - list_del_init(&blip->li_bio_list); - iip = INODE_ITEM(blip); - iip->ili_logged = 0; - iip->ili_last_fields = 0; - xfs_ifunlock(iip->ili_inode); + if (!list_empty(&ail_updates)) { + xfs_iflush_ail_updates(bp->b_mount->m_ail, &ail_updates); + list_splice_tail(&ail_updates, &flushed_inodes); } - list_del(&tmp); + + xfs_iflush_finish(bp, &flushed_inodes); + if (!list_empty(&flushed_inodes)) + list_splice_tail(&flushed_inodes, &bp->b_li_list); } /* @@ -762,37 +762,37 @@ xfs_iflush_done( */ void xfs_iflush_abort( - struct xfs_inode *ip) + struct xfs_inode *ip) { - struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_buf *bp = NULL; if (iip) { - xfs_trans_ail_delete(&iip->ili_item, 0); - iip->ili_logged = 0; /* - * Clear the ili_last_fields bits now that we know that the - * data corresponding to them is safely on disk. + * Clear the failed bit before removing the item from the AIL so + * xfs_trans_ail_delete() doesn't try to clear and release the + * buffer attached to the log item before we are done with it. */ - iip->ili_last_fields = 0; + clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); + xfs_trans_ail_delete(&iip->ili_item, 0); + /* * Clear the inode logging fields so no more flushes are * attempted. */ + spin_lock(&iip->ili_lock); + iip->ili_last_fields = 0; iip->ili_fields = 0; iip->ili_fsync_fields = 0; + iip->ili_flush_lsn = 0; + bp = iip->ili_item.li_buf; + iip->ili_item.li_buf = NULL; + list_del_init(&iip->ili_item.li_bio_list); + spin_unlock(&iip->ili_lock); } - /* - * Release the inode's flush lock since we're done with it. - */ xfs_ifunlock(ip); -} - -void -xfs_istale_done( - struct xfs_buf *bp, - struct xfs_log_item *lip) -{ - xfs_iflush_abort(INODE_ITEM(lip)->ili_inode); + if (bp) + xfs_buf_rele(bp); } /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 60b34bb66e8e..048b5e7dee90 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -16,24 +16,34 @@ struct xfs_mount; struct xfs_inode_log_item { struct xfs_log_item ili_item; /* common portion */ struct xfs_inode *ili_inode; /* inode ptr */ - xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ - xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ - unsigned short ili_lock_flags; /* lock flags */ - unsigned short ili_logged; /* flushed logged data */ + unsigned short ili_lock_flags; /* inode lock flags */ + /* + * The ili_lock protects the interactions between the dirty state and + * the flush state of the inode log item. This allows us to do atomic + * modifications of multiple state fields without having to hold a + * specific inode lock to serialise them. + * + * We need atomic changes between inode dirtying, inode flushing and + * inode completion, but these all hold different combinations of + * ILOCK and iflock and hence we need some other method of serialising + * updates to the flush state. 
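On the ili_lock documentation above: the point of a dedicated spinlock is that several related fields can move together atomically, as in the flush path where the dirty mask is snapshotted into ili_last_fields and cleared in one critical section. A userspace model of that snapshot-and-clear (demo_* names hypothetical):

	#include <pthread.h>
	#include <stdio.h>

	struct demo_log_item {
		pthread_mutex_t lock;		/* like ili_lock */
		unsigned int fields;		/* like ili_fields */
		unsigned int last_fields;	/* like ili_last_fields */
	};

	/* flush time: move the dirty state to the in-flight state */
	static void demo_snapshot(struct demo_log_item *iip)
	{
		pthread_mutex_lock(&iip->lock);
		iip->last_fields = iip->fields;	/* what this flush covers */
		iip->fields = 0;		/* fresh dirtying starts clean */
		pthread_mutex_unlock(&iip->lock);
	}

	int main(void)
	{
		struct demo_log_item iip = { PTHREAD_MUTEX_INITIALIZER, 0x5, 0 };

		demo_snapshot(&iip);
		printf("in flight 0x%x, dirty 0x%x\n",
		       iip.last_fields, iip.fields);
		return 0;
	}

Without one lock covering both fields, a concurrent dirtier could slip between the two stores and have its update lost from both masks.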
+ */ + spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ unsigned int ili_fsync_fields; /* logged since last fsync */ + xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ }; -static inline int xfs_inode_clean(xfs_inode_t *ip) +static inline int xfs_inode_clean(struct xfs_inode *ip) { return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL); } extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); -extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); -extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); +extern void xfs_iflush_done(struct xfs_buf *); extern void xfs_iflush_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index dc3e26ff16c9..5e0d291835b3 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -376,7 +376,7 @@ out_owner_change: xfs_dinode_calc_crc(log->l_mp, dip); ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; + bp->b_flags |= _XBF_LOGRECOVERY; xfs_buf_delwri_queue(bp, buffer_list); out_release: diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index a190212ca85d..6f22a66777cd 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1075,13 +1075,18 @@ xfs_merge_ioc_xflags( xflags |= FS_XFLAG_NODUMP; else xflags &= ~FS_XFLAG_NODUMP; + if (flags & FS_DAX_FL) + xflags |= FS_XFLAG_DAX; + else + xflags &= ~FS_XFLAG_DAX; return xflags; } STATIC unsigned int xfs_di2lxflags( - uint16_t di_flags) + uint16_t di_flags, + uint64_t di_flags2) { unsigned int flags = 0; @@ -1095,6 +1100,9 @@ xfs_di2lxflags( flags |= FS_NOATIME_FL; if (di_flags & XFS_DIFLAG_NODUMP) flags |= FS_NODUMP_FL; + if (di_flags2 & XFS_DIFLAG2_DAX) { + flags |= FS_DAX_FL; + } return flags; } @@ -1565,7 +1573,7 @@ xfs_ioc_getxflags( { unsigned int flags; - flags = xfs_di2lxflags(ip->i_d.di_flags); + flags = xfs_di2lxflags(ip->i_d.di_flags, ip->i_d.di_flags2); if (copy_to_user(arg, &flags, sizeof(flags))) return -EFAULT; return 0; @@ -1588,7 +1596,7 @@ xfs_ioc_setxflags( if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NOATIME_FL | FS_NODUMP_FL | \ - FS_SYNC_FL)) + FS_SYNC_FL | FS_DAX_FL)) return -EOPNOTSUPP; fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index b9a8c3798e08..0e3f62cde375 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -293,11 +293,11 @@ out_trans_cancel: STATIC bool xfs_quota_need_throttle( - struct xfs_inode *ip, - int type, - xfs_fsblock_t alloc_blocks) + struct xfs_inode *ip, + xfs_dqtype_t type, + xfs_fsblock_t alloc_blocks) { - struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); if (!dq || !xfs_this_quota_on(ip->i_mount, type)) return false; @@ -307,7 +307,7 @@ xfs_quota_need_throttle( return false; /* under the lo watermark, no throttle */ - if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark) + if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark) return false; return true; @@ -315,24 +315,24 @@ xfs_quota_need_throttle( STATIC void xfs_quota_calc_throttle( - struct xfs_inode *ip, - int type, - xfs_fsblock_t *qblocks, - int *qshift, - int64_t *qfreesp) + struct xfs_inode *ip, + 
xfs_dqtype_t type, + xfs_fsblock_t *qblocks, + int *qshift, + int64_t *qfreesp) { - int64_t freesp; - int shift = 0; - struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + int64_t freesp; + int shift = 0; /* no dq, or over hi wmark, squash the prealloc completely */ - if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { + if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) { *qblocks = 0; *qfreesp = 0; return; } - freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount; + freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved; if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) { shift = 2; if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT]) @@ -450,14 +450,14 @@ xfs_iomap_prealloc_size( * Check each quota to cap the prealloc size, provide a shift value to * throttle with and adjust amount of available space. */ - if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift, + if (xfs_quota_need_throttle(ip, XFS_DQTYPE_USER, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQTYPE_USER, &qblocks, &qshift, &freesp); - if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift, + if (xfs_quota_need_throttle(ip, XFS_DQTYPE_GROUP, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQTYPE_GROUP, &qblocks, &qshift, &freesp); - if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift, + if (xfs_quota_need_throttle(ip, XFS_DQTYPE_PROJ, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQTYPE_PROJ, &qblocks, &qshift, &freesp); /* diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 9f70d2f68e05..ab737fed7b12 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -102,12 +102,8 @@ typedef __u32 xfs_nlink_t; #define xfs_cowb_secs xfs_params.cowb_timer.val #define current_cpu() (raw_smp_processor_id()) -#define current_pid() (current->pid) -#define current_test_flags(f) (current->flags & (f)) #define current_set_flags_nested(sp, f) \ (*(sp) = current->flags, current->flags |= (f)) -#define current_clear_flags_nested(sp, f) \ - (*(sp) = current->flags, current->flags &= ~(f)) #define current_restore_flags_nested(sp, f) \ (current->flags = ((current->flags & ~(f)) | (*(sp) & (f)))) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 00fda2e8e738..ad0c69ee8947 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -433,7 +433,7 @@ xfs_log_reserve( XFS_STATS_INC(mp, xs_try_logspace); ASSERT(*ticp == NULL); - tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent); *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? 
tic->t_unit_res * tic->t_cnt @@ -3408,15 +3408,12 @@ xlog_ticket_alloc( int unit_bytes, int cnt, char client, - bool permanent, - xfs_km_flags_t alloc_flags) + bool permanent) { struct xlog_ticket *tic; int unit_res; - tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); - if (!tic) - return NULL; + tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL); unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 9ed90368ab31..56c32eecffea 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -37,8 +37,7 @@ xlog_cil_ticket_alloc( { struct xlog_ticket *tic; - tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, - KM_NOFS); + tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0); /* * set the current reservation to zero so we know to steal the basic diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 75a62870b63a..1c6fdbf3d506 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -464,9 +464,7 @@ xlog_ticket_alloc( int unit_bytes, int count, char client, - bool permanent, - xfs_km_flags_t alloc_flags); - + bool permanent); static inline void xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index ec015df55b77..52a65a74208f 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -287,9 +287,8 @@ xlog_recover_iodone( if (bp->b_log_item) xfs_buf_item_relse(bp); ASSERT(bp->b_log_item == NULL); - - bp->b_iodone = NULL; - xfs_buf_ioend(bp); + bp->b_flags &= ~_XBF_LOGRECOVERY; + xfs_buf_ioend_finish(bp); } /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d5dcf9869860..c8ae49a1e99c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -148,7 +148,6 @@ xfs_free_perag( ASSERT(atomic_read(&pag->pag_ref) == 0); xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); - mutex_destroy(&pag->pag_ici_reclaim_lock); call_rcu(&pag->rcu_head, __xfs_free_perag); } } @@ -200,7 +199,6 @@ xfs_initialize_perag( pag->pag_agno = index; pag->pag_mount = mp; spin_lock_init(&pag->pag_ici_lock); - mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); if (xfs_buf_hash_init(pag)) goto out_free_pag; @@ -242,7 +240,6 @@ xfs_initialize_perag( out_hash_destroy: xfs_buf_hash_destroy(pag); out_free_pag: - mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); out_unwind_new_pags: /* unwind any prior newly initialized pags */ @@ -252,7 +249,6 @@ out_unwind_new_pags: break; xfs_buf_hash_destroy(pag); xfs_iunlink_destroy(pag); - mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); } return error; @@ -1015,7 +1011,7 @@ xfs_mountfs( * quota inodes. */ cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp, SYNC_WAIT); + xfs_reclaim_inodes(mp); xfs_health_unmount(mp); out_log_dealloc: mp->m_flags |= XFS_MOUNT_UNMOUNTING; @@ -1092,13 +1088,12 @@ xfs_unmountfs( xfs_ail_push_all_sync(mp->m_ail); /* - * And reclaim all inodes. At this point there should be no dirty - * inodes and none should be pinned or locked, but use synchronous - * reclaim just to be sure. We can stop background inode reclaim - * here as well if it is still running. + * Reclaim all inodes. At this point there should be no dirty inodes and + * none should be pinned or locked. Stop background inode reclaim here + * if it is still running. 
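On the allocation change above: dropping the xfs_km_flags_t argument works because kmem_cache_zalloc() with __GFP_NOFAIL makes the page allocator perform the retry-forever loop that XFS previously hand-rolled, so the ticket path can never see NULL. A kernel-style sketch of such a call site (demo_* names hypothetical; the slab API and GFP flags are real):

	#include <linux/slab.h>

	struct demo_ticket { int t_unit_res; };

	static struct kmem_cache *demo_ticket_zone;

	static struct demo_ticket *demo_ticket_alloc(void)
	{
		/*
		 * GFP_NOFS: safe with filesystem locks held;
		 * __GFP_NOFAIL: the allocator retries, never returns NULL.
		 */
		return kmem_cache_zalloc(demo_ticket_zone,
					 GFP_NOFS | __GFP_NOFAIL);
	}

Callers that can tolerate failure instead pass plain GFP flags and keep their NULL checks, which is how the KM_MAYFAIL cases translate.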
*/ cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp, SYNC_WAIT); + xfs_reclaim_inodes(mp); xfs_health_unmount(mp); xfs_qm_unmount(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 3725d25ad97e..a72cfcaa4ad1 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -354,7 +354,6 @@ typedef struct xfs_perag { spinlock_t pag_ici_lock; /* incore inode cache lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ int pag_ici_reclaimable; /* reclaimable inodes */ - struct mutex pag_ici_reclaim_lock; /* serialisation point */ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ /* buffer cache index */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index d6cd83317344..be67570badf8 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -47,7 +47,7 @@ STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); STATIC int xfs_qm_dquot_walk( struct xfs_mount *mp, - int type, + xfs_dqtype_t type, int (*execute)(struct xfs_dquot *dqp, void *data), void *data) { @@ -79,7 +79,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_dquot *dqp = batch[i]; - next_index = be32_to_cpu(dqp->q_core.d_id) + 1; + next_index = dqp->q_id + 1; error = execute(batch[i], data); if (error == -EAGAIN) { @@ -124,10 +124,10 @@ xfs_qm_dqpurge( int error = -EAGAIN; xfs_dqlock(dqp); - if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) + if ((dqp->q_flags & XFS_DQFLAG_FREEING) || dqp->q_nrefs != 0) goto out_unlock; - dqp->dq_flags |= XFS_DQ_FREEING; + dqp->q_flags |= XFS_DQFLAG_FREEING; xfs_dqflock(dqp); @@ -148,6 +148,7 @@ xfs_qm_dqpurge( error = xfs_bwrite(bp); xfs_buf_relse(bp); } else if (error == -EAGAIN) { + dqp->q_flags &= ~XFS_DQFLAG_FREEING; goto out_unlock; } xfs_dqflock(dqp); @@ -160,8 +161,7 @@ xfs_qm_dqpurge( xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), - be32_to_cpu(dqp->q_core.d_id)); + radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id); qi->qi_dquots--; /* @@ -189,11 +189,11 @@ xfs_qm_dqpurge_all( uint flags) { if (flags & XFS_QMOPT_UQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_GQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_PQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL); } /* @@ -250,7 +250,7 @@ STATIC int xfs_qm_dqattach_one( struct xfs_inode *ip, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, bool doalloc, struct xfs_dquot **IO_idqpp) { @@ -331,7 +331,7 @@ xfs_qm_dqattach_locked( if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)), - XFS_DQ_USER, doalloc, &ip->i_udquot); + XFS_DQTYPE_USER, doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); @@ -339,14 +339,14 @@ xfs_qm_dqattach_locked( if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)), - XFS_DQ_GROUP, doalloc, &ip->i_gdquot); + XFS_DQTYPE_GROUP, doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); } if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, + error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQTYPE_PROJ, doalloc, &ip->i_pdquot); if (error) goto done; @@ -473,7 +473,7 @@ xfs_qm_dquot_isolate( /* * Prevent lookups now 
that we are past the point of no return. */ - dqp->dq_flags |= XFS_DQ_FREEING; + dqp->q_flags |= XFS_DQFLAG_FREEING; xfs_dqunlock(dqp); ASSERT(dqp->q_nrefs == 0); @@ -545,31 +545,29 @@ xfs_qm_shrink_count( STATIC void xfs_qm_set_defquota( struct xfs_mount *mp, - uint type, + xfs_dqtype_t type, struct xfs_quotainfo *qinf) { struct xfs_dquot *dqp; struct xfs_def_quota *defq; - struct xfs_disk_dquot *ddqp; int error; error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); if (error) return; - ddqp = &dqp->q_core; defq = xfs_get_defquota(qinf, xfs_dquot_type(dqp)); /* * Timers and warnings have been already set, let's just set the * default limits for this quota type */ - defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); - defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); - defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); - defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); - defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); - defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + defq->blk.hard = dqp->q_blk.hardlimit; + defq->blk.soft = dqp->q_blk.softlimit; + defq->ino.hard = dqp->q_ino.hardlimit; + defq->ino.soft = dqp->q_ino.softlimit; + defq->rtb.hard = dqp->q_rtb.hardlimit; + defq->rtb.soft = dqp->q_rtb.softlimit; xfs_qm_dqdestroy(dqp); } @@ -577,22 +575,21 @@ xfs_qm_set_defquota( static void xfs_qm_init_timelimits( struct xfs_mount *mp, - uint type) + xfs_dqtype_t type) { struct xfs_quotainfo *qinf = mp->m_quotainfo; struct xfs_def_quota *defq; - struct xfs_disk_dquot *ddqp; struct xfs_dquot *dqp; int error; defq = xfs_get_defquota(qinf, type); - defq->btimelimit = XFS_QM_BTIMELIMIT; - defq->itimelimit = XFS_QM_ITIMELIMIT; - defq->rtbtimelimit = XFS_QM_RTBTIMELIMIT; - defq->bwarnlimit = XFS_QM_BWARNLIMIT; - defq->iwarnlimit = XFS_QM_IWARNLIMIT; - defq->rtbwarnlimit = XFS_QM_RTBWARNLIMIT; + defq->blk.time = XFS_QM_BTIMELIMIT; + defq->ino.time = XFS_QM_ITIMELIMIT; + defq->rtb.time = XFS_QM_RTBTIMELIMIT; + defq->blk.warn = XFS_QM_BWARNLIMIT; + defq->ino.warn = XFS_QM_IWARNLIMIT; + defq->rtb.warn = XFS_QM_RTBWARNLIMIT; /* * We try to get the limits from the superuser's limits fields. @@ -605,25 +602,23 @@ xfs_qm_init_timelimits( if (error) return; - ddqp = &dqp->q_core; - /* * The warnings and timers set the grace period given to * a user or group before he or she can not perform any * more writing. If it is zero, a default is used. 
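The defq->blk/ino/rtb assignments above reflect a refactor that groups each resource's limits into one sub-structure, so blocks, inodes, and realtime blocks share a single shape instead of six parallel scalar fields. A minimal model of that layout (field names here are illustrative, patterned on the patch):

	#include <stdint.h>
	#include <stdio.h>

	struct demo_res {	/* one resource: blk, ino, or rtb */
		uint64_t hard;	/* hard limit */
		uint64_t soft;	/* soft limit */
		int64_t time;	/* grace period */
		uint16_t warn;	/* warning limit */
	};

	struct demo_defquota {
		struct demo_res blk;
		struct demo_res ino;
		struct demo_res rtb;
	};

	int main(void)
	{
		struct demo_defquota defq = { 0 };

		defq.blk.hard = 1000;
		defq.ino.soft = 50;
		printf("blk hard %llu, ino soft %llu\n",
		       (unsigned long long)defq.blk.hard,
		       (unsigned long long)defq.ino.soft);
		return 0;
	}

Grouping the fields also lets helpers take a struct demo_res * and operate on any of the three resources uniformly.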
*/ - if (ddqp->d_btimer) - defq->btimelimit = be32_to_cpu(ddqp->d_btimer); - if (ddqp->d_itimer) - defq->itimelimit = be32_to_cpu(ddqp->d_itimer); - if (ddqp->d_rtbtimer) - defq->rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer); - if (ddqp->d_bwarns) - defq->bwarnlimit = be16_to_cpu(ddqp->d_bwarns); - if (ddqp->d_iwarns) - defq->iwarnlimit = be16_to_cpu(ddqp->d_iwarns); - if (ddqp->d_rtbwarns) - defq->rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns); + if (dqp->q_blk.timer) + defq->blk.time = dqp->q_blk.timer; + if (dqp->q_ino.timer) + defq->ino.time = dqp->q_ino.timer; + if (dqp->q_rtb.timer) + defq->rtb.time = dqp->q_rtb.timer; + if (dqp->q_blk.warnings) + defq->blk.warn = dqp->q_blk.warnings; + if (dqp->q_ino.warnings) + defq->ino.warn = dqp->q_ino.warnings; + if (dqp->q_rtb.warnings) + defq->rtb.warn = dqp->q_rtb.warnings; xfs_qm_dqdestroy(dqp); } @@ -669,16 +664,16 @@ xfs_qm_init_quotainfo( mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); - xfs_qm_init_timelimits(mp, XFS_DQ_USER); - xfs_qm_init_timelimits(mp, XFS_DQ_GROUP); - xfs_qm_init_timelimits(mp, XFS_DQ_PROJ); + xfs_qm_init_timelimits(mp, XFS_DQTYPE_USER); + xfs_qm_init_timelimits(mp, XFS_DQTYPE_GROUP); + xfs_qm_init_timelimits(mp, XFS_DQTYPE_PROJ); if (XFS_IS_UQUOTA_RUNNING(mp)) - xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf); + xfs_qm_set_defquota(mp, XFS_DQTYPE_USER, qinf); if (XFS_IS_GQUOTA_RUNNING(mp)) - xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf); + xfs_qm_set_defquota(mp, XFS_DQTYPE_GROUP, qinf); if (XFS_IS_PQUOTA_RUNNING(mp)) - xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf); + xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf); qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; @@ -828,14 +823,13 @@ xfs_qm_qino_alloc( STATIC void xfs_qm_reset_dqcounts( - xfs_mount_t *mp, - xfs_buf_t *bp, - xfs_dqid_t id, - uint type) + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_dqid_t id, + xfs_dqtype_t type) { struct xfs_dqblk *dqb; int j; - xfs_failaddr_t fa; trace_xfs_reset_dqcounts(bp, _RET_IP_); @@ -860,15 +854,15 @@ xfs_qm_reset_dqcounts( * find uninitialised dquot blks. See comment in * xfs_dquot_verify. */ - fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type); - if (fa) + if (xfs_dqblk_verify(mp, &dqb[j], id + j) || + (dqb[j].dd_diskdq.d_type & XFS_DQTYPE_REC_MASK) != type) xfs_dqblk_repair(mp, &dqb[j], id + j, type); /* * Reset type in case we are reusing group quota file for * project quotas or vice versa */ - ddq->d_flags = type; + ddq->d_type = type; ddq->d_bcount = 0; ddq->d_icount = 0; ddq->d_rtbcount = 0; @@ -901,17 +895,13 @@ xfs_qm_reset_dqcounts_all( xfs_dqid_t firstid, xfs_fsblock_t bno, xfs_filblks_t blkcnt, - uint flags, + xfs_dqtype_t type, struct list_head *buffer_list) { struct xfs_buf *bp; - int error; - int type; + int error = 0; ASSERT(blkcnt > 0); - type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : - (flags & XFS_QMOPT_PQUOTA ? 
XFS_DQ_PROJ : XFS_DQ_GROUP); - error = 0; /* * Blkcnt arg can be a very big number, and might even be @@ -971,7 +961,7 @@ STATIC int xfs_qm_reset_dqcounts_buf( struct xfs_mount *mp, struct xfs_inode *qip, - uint flags, + xfs_dqtype_t type, struct list_head *buffer_list) { struct xfs_bmbt_irec *map; @@ -1047,7 +1037,7 @@ xfs_qm_reset_dqcounts_buf( error = xfs_qm_reset_dqcounts_all(mp, firstid, map[i].br_startblock, map[i].br_blockcount, - flags, buffer_list); + type, buffer_list); if (error) goto out; } @@ -1069,7 +1059,7 @@ out: STATIC int xfs_qm_quotacheck_dqadjust( struct xfs_inode *ip, - uint type, + xfs_dqtype_t type, xfs_qcnt_t nblks, xfs_qcnt_t rtblks) { @@ -1095,15 +1085,15 @@ xfs_qm_quotacheck_dqadjust( * Adjust the inode count and the block count to reflect this inode's * resource usage. */ - be64_add_cpu(&dqp->q_core.d_icount, 1); - dqp->q_res_icount++; + dqp->q_ino.count++; + dqp->q_ino.reserved++; if (nblks) { - be64_add_cpu(&dqp->q_core.d_bcount, nblks); - dqp->q_res_bcount += nblks; + dqp->q_blk.count += nblks; + dqp->q_blk.reserved += nblks; } if (rtblks) { - be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks); - dqp->q_res_rtbcount += rtblks; + dqp->q_rtb.count += rtblks; + dqp->q_rtb.reserved += rtblks; } /* @@ -1111,12 +1101,12 @@ xfs_qm_quotacheck_dqadjust( * * There are no timers for the default values set in the root dquot. */ - if (dqp->q_core.d_id) { - xfs_qm_adjust_dqlimits(mp, dqp); - xfs_qm_adjust_dqtimers(mp, dqp); + if (dqp->q_id) { + xfs_qm_adjust_dqlimits(dqp); + xfs_qm_adjust_dqtimers(dqp); } - dqp->dq_flags |= XFS_DQ_DIRTY; + dqp->q_flags |= XFS_DQFLAG_DIRTY; xfs_qm_dqput(dqp); return 0; } @@ -1186,21 +1176,21 @@ xfs_qm_dqusage_adjust( * and quotaoffs don't race. (Quotachecks happen at mount time only). */ if (XFS_IS_UQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks, + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_USER, nblks, rtblks); if (error) goto error0; } if (XFS_IS_GQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks, + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_GROUP, nblks, rtblks); if (error) goto error0; } if (XFS_IS_PQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks, + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_PROJ, nblks, rtblks); if (error) goto error0; @@ -1222,7 +1212,7 @@ xfs_qm_flush_one( int error = 0; xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_FREEING) + if (dqp->q_flags & XFS_DQFLAG_FREEING) goto out_unlock; if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; @@ -1291,7 +1281,7 @@ xfs_qm_quotacheck( * We don't log our changes till later. */ if (uip) { - error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_DQTYPE_USER, &buffer_list); if (error) goto error_return; @@ -1299,7 +1289,7 @@ xfs_qm_quotacheck( } if (gip) { - error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_DQTYPE_GROUP, &buffer_list); if (error) goto error_return; @@ -1307,7 +1297,7 @@ xfs_qm_quotacheck( } if (pip) { - error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_DQTYPE_PROJ, &buffer_list); if (error) goto error_return; @@ -1324,17 +1314,17 @@ xfs_qm_quotacheck( * down to disk buffers if everything was updated successfully. 
*/ if (XFS_IS_UQUOTA_ON(mp)) { - error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one, + error = xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_flush_one, &buffer_list); } if (XFS_IS_GQUOTA_ON(mp)) { - error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one, + error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_flush_one, &buffer_list); if (!error) error = error2; } if (XFS_IS_PQUOTA_ON(mp)) { - error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one, + error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_flush_one, &buffer_list); if (!error) error = error2; @@ -1597,8 +1587,7 @@ xfs_qm_dqfree_one( struct xfs_quotainfo *qi = mp->m_quotainfo; mutex_lock(&qi->qi_tree_lock); - radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), - be32_to_cpu(dqp->q_core.d_id)); + radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id); qi->qi_dquots--; mutex_unlock(&qi->qi_tree_lock); @@ -1673,7 +1662,7 @@ xfs_qm_vop_dqalloc( */ xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, from_kuid(user_ns, uid), - XFS_DQ_USER, true, &uq); + XFS_DQTYPE_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; @@ -1697,7 +1686,7 @@ xfs_qm_vop_dqalloc( if (!gid_eq(inode->i_gid, gid)) { xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, from_kgid(user_ns, gid), - XFS_DQ_GROUP, true, &gq); + XFS_DQTYPE_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1713,8 +1702,8 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (ip->i_d.di_projid != prid) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ, - true, &pq); + error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, + XFS_DQTYPE_PROJ, true, &pq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1822,7 +1811,7 @@ xfs_qm_vop_chown_reserve( XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && - i_uid_read(VFS_I(ip)) != be32_to_cpu(udqp->q_core.d_id)) { + i_uid_read(VFS_I(ip)) != udqp->q_id) { udq_delblks = udqp; /* * If there are delayed allocation blocks, then we have to @@ -1835,7 +1824,7 @@ xfs_qm_vop_chown_reserve( } } if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && - i_gid_read(VFS_I(ip)) != be32_to_cpu(gdqp->q_core.d_id)) { + i_gid_read(VFS_I(ip)) != gdqp->q_id) { gdq_delblks = gdqp; if (delblks) { ASSERT(ip->i_gdquot); @@ -1844,7 +1833,7 @@ xfs_qm_vop_chown_reserve( } if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && - ip->i_d.di_projid != be32_to_cpu(pdqp->q_core.d_id)) { + ip->i_d.di_projid != pdqp->q_id) { pdq_delblks = pdqp; if (delblks) { ASSERT(ip->i_pdquot); @@ -1928,21 +1917,21 @@ xfs_qm_vop_create_dqattach( if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); - ASSERT(i_uid_read(VFS_I(ip)) == be32_to_cpu(udqp->q_core.d_id)); + ASSERT(i_uid_read(VFS_I(ip)) == udqp->q_id); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); - ASSERT(i_gid_read(VFS_I(ip)) == be32_to_cpu(gdqp->q_core.d_id)); + ASSERT(i_gid_read(VFS_I(ip)) == gdqp->q_id); ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); - ASSERT(ip->i_d.di_projid == be32_to_cpu(pdqp->q_core.d_id)); + ASSERT(ip->i_d.di_projid == pdqp->q_id); ip->i_pdquot = xfs_qm_dqhold(pdqp); xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 7b0e771fcbce..9c078c35d924 100644 
--- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -20,41 +20,28 @@ extern struct kmem_zone *xfs_qm_dqtrxzone; #define XFS_DQITER_MAP_SIZE 10 #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ - !dqp->q_core.d_blk_hardlimit && \ - !dqp->q_core.d_blk_softlimit && \ - !dqp->q_core.d_rtb_hardlimit && \ - !dqp->q_core.d_rtb_softlimit && \ - !dqp->q_core.d_ino_hardlimit && \ - !dqp->q_core.d_ino_softlimit && \ - !dqp->q_core.d_bcount && \ - !dqp->q_core.d_rtbcount && \ - !dqp->q_core.d_icount) - -/* - * This defines the unit of allocation of dquots. - * Currently, it is just one file system block, and a 4K blk contains 30 - * (136 * 30 = 4080) dquots. It's probably not worth trying to make - * this more dynamic. - * XXXsup However, if this number is changed, we have to make sure that we don't - * implicitly assume that we do allocations in chunks of a single filesystem - * block in the dquot/xqm code. - */ -#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 + !dqp->q_blk.hardlimit && \ + !dqp->q_blk.softlimit && \ + !dqp->q_rtb.hardlimit && \ + !dqp->q_rtb.softlimit && \ + !dqp->q_ino.hardlimit && \ + !dqp->q_ino.softlimit && \ + !dqp->q_blk.count && \ + !dqp->q_rtb.count && \ + !dqp->q_ino.count) + +struct xfs_quota_limits { + xfs_qcnt_t hard; /* default hard limit */ + xfs_qcnt_t soft; /* default soft limit */ + time64_t time; /* limit for timers */ + xfs_qwarncnt_t warn; /* limit for warnings */ +}; /* Defaults for each quota type: time limits, warn limits, usage limits */ struct xfs_def_quota { - time64_t btimelimit; /* limit for blks timer */ - time64_t itimelimit; /* limit for inodes timer */ - time64_t rtbtimelimit; /* limit for rt blks timer */ - xfs_qwarncnt_t bwarnlimit; /* limit for blks warnings */ - xfs_qwarncnt_t iwarnlimit; /* limit for inodes warnings */ - xfs_qwarncnt_t rtbwarnlimit; /* limit for rt blks warnings */ - xfs_qcnt_t bhardlimit; /* default data blk hard limit */ - xfs_qcnt_t bsoftlimit; /* default data blk soft limit */ - xfs_qcnt_t ihardlimit; /* default inode count hard limit */ - xfs_qcnt_t isoftlimit; /* default inode count soft limit */ - xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */ - xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */ + struct xfs_quota_limits blk; + struct xfs_quota_limits ino; + struct xfs_quota_limits rtb; }; /* @@ -83,14 +70,14 @@ struct xfs_quotainfo { static inline struct radix_tree_root * xfs_dquot_tree( struct xfs_quotainfo *qi, - int type) + xfs_dqtype_t type) { switch (type) { - case XFS_DQ_USER: + case XFS_DQTYPE_USER: return &qi->qi_uquota_tree; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return &qi->qi_gquota_tree; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return &qi->qi_pquota_tree; default: ASSERT(0); @@ -99,14 +86,14 @@ xfs_dquot_tree( } static inline struct xfs_inode * -xfs_quota_inode(xfs_mount_t *mp, uint dq_flags) +xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type) { - switch (dq_flags & XFS_DQ_ALLTYPES) { - case XFS_DQ_USER: + switch (type) { + case XFS_DQTYPE_USER: return mp->m_quotainfo->qi_uquotaip; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return mp->m_quotainfo->qi_gquotaip; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return mp->m_quotainfo->qi_pquotaip; default: ASSERT(0); @@ -114,17 +101,6 @@ xfs_quota_inode(xfs_mount_t *mp, uint dq_flags) return NULL; } -static inline int -xfs_dquot_type(struct xfs_dquot *dqp) -{ - if (XFS_QM_ISUDQ(dqp)) - return XFS_DQ_USER; - if (XFS_QM_ISGDQ(dqp)) - return XFS_DQ_GROUP; - ASSERT(XFS_QM_ISPDQ(dqp)); - return XFS_DQ_PROJ; -} - extern void 
xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp, uint field, int64_t delta); extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); @@ -166,24 +142,30 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); -extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, - uint, struct qc_dqblk *); -extern int xfs_qm_scall_getquota_next(struct xfs_mount *, - xfs_dqid_t *, uint, struct qc_dqblk *); -extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, - struct qc_dqblk *); +extern int xfs_qm_scall_getquota(struct xfs_mount *mp, + xfs_dqid_t id, + xfs_dqtype_t type, + struct qc_dqblk *dst); +extern int xfs_qm_scall_getquota_next(struct xfs_mount *mp, + xfs_dqid_t *id, + xfs_dqtype_t type, + struct qc_dqblk *dst); +extern int xfs_qm_scall_setqlim(struct xfs_mount *mp, + xfs_dqid_t id, + xfs_dqtype_t type, + struct qc_dqblk *newlim); extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); static inline struct xfs_def_quota * -xfs_get_defquota(struct xfs_quotainfo *qi, int type) +xfs_get_defquota(struct xfs_quotainfo *qi, xfs_dqtype_t type) { switch (type) { - case XFS_DQ_USER: + case XFS_DQTYPE_USER: return &qi->qi_usr_default; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return &qi->qi_grp_default; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return &qi->qi_prj_default; default: ASSERT(0); diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index fc2fa418919f..639398091ad6 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -23,24 +23,24 @@ xfs_fill_statvfs_from_dquot( { uint64_t limit; - limit = dqp->q_core.d_blk_softlimit ? - be64_to_cpu(dqp->q_core.d_blk_softlimit) : - be64_to_cpu(dqp->q_core.d_blk_hardlimit); + limit = dqp->q_blk.softlimit ? + dqp->q_blk.softlimit : + dqp->q_blk.hardlimit; if (limit && statp->f_blocks > limit) { statp->f_blocks = limit; statp->f_bfree = statp->f_bavail = - (statp->f_blocks > dqp->q_res_bcount) ? - (statp->f_blocks - dqp->q_res_bcount) : 0; + (statp->f_blocks > dqp->q_blk.reserved) ? + (statp->f_blocks - dqp->q_blk.reserved) : 0; } - limit = dqp->q_core.d_ino_softlimit ? - be64_to_cpu(dqp->q_core.d_ino_softlimit) : - be64_to_cpu(dqp->q_core.d_ino_hardlimit); + limit = dqp->q_ino.softlimit ? + dqp->q_ino.softlimit : + dqp->q_ino.hardlimit; if (limit && statp->f_files > limit) { statp->f_files = limit; statp->f_ffree = - (statp->f_files > dqp->q_res_icount) ? - (statp->f_files - dqp->q_res_icount) : 0; + (statp->f_files > dqp->q_ino.reserved) ? 
+ (statp->f_files - dqp->q_ino.reserved) : 0; } } @@ -60,7 +60,7 @@ xfs_qm_statvfs( struct xfs_mount *mp = ip->i_mount; struct xfs_dquot *dqp; - if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQ_PROJ, false, &dqp)) { + if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQTYPE_PROJ, false, &dqp)) { xfs_fill_statvfs_from_dquot(statp, dqp); xfs_qm_dqput(dqp); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 7effd7a28136..1c542b4a5220 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -322,23 +322,23 @@ xfs_qm_scall_trunc_qfiles( int error = -EINVAL; if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 || - (flags & ~XFS_DQ_ALLTYPES)) { + (flags & ~XFS_QMOPT_QUOTALL)) { xfs_debug(mp, "%s: flags=%x m_qflags=%x", __func__, flags, mp->m_qflags); return -EINVAL; } - if (flags & XFS_DQ_USER) { + if (flags & XFS_QMOPT_UQUOTA) { error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); if (error) return error; } - if (flags & XFS_DQ_GROUP) { + if (flags & XFS_QMOPT_GQUOTA) { error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); if (error) return error; } - if (flags & XFS_DQ_PROJ) + if (flags & XFS_QMOPT_PQUOTA) error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); return error; @@ -437,20 +437,73 @@ xfs_qm_scall_quotaon( (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK) /* + * Adjust limits of this quota, and the defaults if passed in. Returns true + * if the new limits made sense and were applied, false otherwise. + */ +static inline bool +xfs_setqlim_limits( + struct xfs_mount *mp, + struct xfs_dquot_res *res, + struct xfs_quota_limits *qlim, + xfs_qcnt_t hard, + xfs_qcnt_t soft, + const char *tag) +{ + /* The hard limit can't be less than the soft limit. */ + if (hard != 0 && hard < soft) { + xfs_debug(mp, "%shard %lld < %ssoft %lld", tag, hard, tag, + soft); + return false; + } + + res->hardlimit = hard; + res->softlimit = soft; + if (qlim) { + qlim->hard = hard; + qlim->soft = soft; + } + + return true; +} + +static inline void +xfs_setqlim_warns( + struct xfs_dquot_res *res, + struct xfs_quota_limits *qlim, + int warns) +{ + res->warnings = warns; + if (qlim) + qlim->warn = warns; +} + +static inline void +xfs_setqlim_timer( + struct xfs_dquot_res *res, + struct xfs_quota_limits *qlim, + s64 timer) +{ + res->timer = timer; + if (qlim) + qlim->time = timer; +} + +/* * Adjust quota limits, and start/stop timers accordingly. */ int xfs_qm_scall_setqlim( struct xfs_mount *mp, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, struct qc_dqblk *newlim) { struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_disk_dquot *ddq; struct xfs_dquot *dqp; struct xfs_trans *tp; struct xfs_def_quota *defq; + struct xfs_dquot_res *res; + struct xfs_quota_limits *qlim; int error; xfs_qcnt_t hard, soft; @@ -488,105 +541,72 @@ xfs_qm_scall_setqlim( xfs_dqlock(dqp); xfs_trans_dqjoin(tp, dqp); - ddq = &dqp->q_core; /* + * Update quota limits, warnings, and timers, and the defaults + * if we're touching id == 0. + * * Make sure that hardlimits are >= soft limits before changing. + * + * Update warnings counter(s) if requested. + * + * Timelimits for the super user set the relative time the other users + * can be over quota for this file system. If it is zero a default is + * used. Ditto for the default soft and hard limit values (already + * done, above), and for warnings. + * + * For other IDs, userspace can bump out the grace period if over + * the soft limit. */ + + /* Blocks on the data device. */ hard = (newlim->d_fieldmask & QC_SPC_HARD) ? 
(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) : - be64_to_cpu(ddq->d_blk_hardlimit); + dqp->q_blk.hardlimit; soft = (newlim->d_fieldmask & QC_SPC_SOFT) ? (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) : - be64_to_cpu(ddq->d_blk_softlimit); - if (hard == 0 || hard >= soft) { - ddq->d_blk_hardlimit = cpu_to_be64(hard); - ddq->d_blk_softlimit = cpu_to_be64(soft); + dqp->q_blk.softlimit; + res = &dqp->q_blk; + qlim = id == 0 ? &defq->blk : NULL; + + if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk")) xfs_dquot_set_prealloc_limits(dqp); - if (id == 0) { - defq->bhardlimit = hard; - defq->bsoftlimit = soft; - } - } else { - xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); - } + if (newlim->d_fieldmask & QC_SPC_WARNS) + xfs_setqlim_warns(res, qlim, newlim->d_spc_warns); + if (newlim->d_fieldmask & QC_SPC_TIMER) + xfs_setqlim_timer(res, qlim, newlim->d_spc_timer); + + /* Blocks on the realtime device. */ hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ? (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) : - be64_to_cpu(ddq->d_rtb_hardlimit); + dqp->q_rtb.hardlimit; soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ? (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) : - be64_to_cpu(ddq->d_rtb_softlimit); - if (hard == 0 || hard >= soft) { - ddq->d_rtb_hardlimit = cpu_to_be64(hard); - ddq->d_rtb_softlimit = cpu_to_be64(soft); - if (id == 0) { - defq->rtbhardlimit = hard; - defq->rtbsoftlimit = soft; - } - } else { - xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); - } + dqp->q_rtb.softlimit; + res = &dqp->q_rtb; + qlim = id == 0 ? &defq->rtb : NULL; + xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb"); + if (newlim->d_fieldmask & QC_RT_SPC_WARNS) + xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns); + if (newlim->d_fieldmask & QC_RT_SPC_TIMER) + xfs_setqlim_timer(res, qlim, newlim->d_rt_spc_timer); + + /* Inodes */ hard = (newlim->d_fieldmask & QC_INO_HARD) ? (xfs_qcnt_t) newlim->d_ino_hardlimit : - be64_to_cpu(ddq->d_ino_hardlimit); + dqp->q_ino.hardlimit; soft = (newlim->d_fieldmask & QC_INO_SOFT) ? (xfs_qcnt_t) newlim->d_ino_softlimit : - be64_to_cpu(ddq->d_ino_softlimit); - if (hard == 0 || hard >= soft) { - ddq->d_ino_hardlimit = cpu_to_be64(hard); - ddq->d_ino_softlimit = cpu_to_be64(soft); - if (id == 0) { - defq->ihardlimit = hard; - defq->isoftlimit = soft; - } - } else { - xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft); - } + dqp->q_ino.softlimit; + res = &dqp->q_ino; + qlim = id == 0 ? &defq->ino : NULL; - /* - * Update warnings counter(s) if requested - */ - if (newlim->d_fieldmask & QC_SPC_WARNS) - ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns); + xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino"); if (newlim->d_fieldmask & QC_INO_WARNS) - ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns); - if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns); - - if (id == 0) { - if (newlim->d_fieldmask & QC_SPC_WARNS) - defq->bwarnlimit = newlim->d_spc_warns; - if (newlim->d_fieldmask & QC_INO_WARNS) - defq->iwarnlimit = newlim->d_ino_warns; - if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - defq->rtbwarnlimit = newlim->d_rt_spc_warns; - } - - /* - * Timelimits for the super user set the relative time the other users - * can be over quota for this file system. If it is zero a default is - * used. Ditto for the default soft and hard limit values (already - * done, above), and for warnings. - * - * For other IDs, userspace can bump out the grace period if over - * the soft limit. 
- */ - if (newlim->d_fieldmask & QC_SPC_TIMER) - ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer); + xfs_setqlim_warns(res, qlim, newlim->d_ino_warns); if (newlim->d_fieldmask & QC_INO_TIMER) - ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer); - if (newlim->d_fieldmask & QC_RT_SPC_TIMER) - ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer); - - if (id == 0) { - if (newlim->d_fieldmask & QC_SPC_TIMER) - defq->btimelimit = newlim->d_spc_timer; - if (newlim->d_fieldmask & QC_INO_TIMER) - defq->itimelimit = newlim->d_ino_timer; - if (newlim->d_fieldmask & QC_RT_SPC_TIMER) - defq->rtbtimelimit = newlim->d_rt_spc_timer; - } + xfs_setqlim_timer(res, qlim, newlim->d_ino_timer); if (id != 0) { /* @@ -596,9 +616,9 @@ xfs_qm_scall_setqlim( * is on or off. We don't really want to bother with iterating * over all ondisk dquots and turning the timers on/off. */ - xfs_qm_adjust_dqtimers(mp, dqp); + xfs_qm_adjust_dqtimers(dqp); } - dqp->dq_flags |= XFS_DQ_DIRTY; + dqp->q_flags |= XFS_DQFLAG_DIRTY; xfs_trans_log_dquot(tp, dqp); error = xfs_trans_commit(tp); @@ -614,58 +634,46 @@ out_unlock: static void xfs_qm_scall_getquota_fill_qc( struct xfs_mount *mp, - uint type, + xfs_dqtype_t type, const struct xfs_dquot *dqp, struct qc_dqblk *dst) { memset(dst, 0, sizeof(*dst)); - dst->d_spc_hardlimit = - XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); - dst->d_spc_softlimit = - XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); - dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); - dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); - dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount); - dst->d_ino_count = dqp->q_res_icount; - dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer); - dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer); - dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns); - dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns); - dst->d_rt_spc_hardlimit = - XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); - dst->d_rt_spc_softlimit = - XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); - dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount); - dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer); - dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns); + dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_blk.hardlimit); + dst->d_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_blk.softlimit); + dst->d_ino_hardlimit = dqp->q_ino.hardlimit; + dst->d_ino_softlimit = dqp->q_ino.softlimit; + dst->d_space = XFS_FSB_TO_B(mp, dqp->q_blk.reserved); + dst->d_ino_count = dqp->q_ino.reserved; + dst->d_spc_timer = dqp->q_blk.timer; + dst->d_ino_timer = dqp->q_ino.timer; + dst->d_ino_warns = dqp->q_ino.warnings; + dst->d_spc_warns = dqp->q_blk.warnings; + dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit); + dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit); + dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved); + dst->d_rt_spc_timer = dqp->q_rtb.timer; + dst->d_rt_spc_warns = dqp->q_rtb.warnings; /* * Internally, we don't reset all the timers when quota enforcement * gets turned off. No need to confuse the user level code, * so return zeroes in that case. 
*/ - if ((!XFS_IS_UQUOTA_ENFORCED(mp) && - dqp->q_core.d_flags == XFS_DQ_USER) || - (!XFS_IS_GQUOTA_ENFORCED(mp) && - dqp->q_core.d_flags == XFS_DQ_GROUP) || - (!XFS_IS_PQUOTA_ENFORCED(mp) && - dqp->q_core.d_flags == XFS_DQ_PROJ)) { + if (!xfs_dquot_is_enforced(dqp)) { dst->d_spc_timer = 0; dst->d_ino_timer = 0; dst->d_rt_spc_timer = 0; } #ifdef DEBUG - if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || - (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || - (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && - dqp->q_core.d_id != 0) { + if (xfs_dquot_is_enforced(dqp) && dqp->q_id != 0) { if ((dst->d_space > dst->d_spc_softlimit) && (dst->d_spc_softlimit > 0)) { ASSERT(dst->d_spc_timer != 0); } - if ((dst->d_ino_count > dst->d_ino_softlimit) && - (dst->d_ino_softlimit > 0)) { + if ((dst->d_ino_count > dqp->q_ino.softlimit) && + (dqp->q_ino.softlimit > 0)) { ASSERT(dst->d_ino_timer != 0); } } @@ -677,7 +685,7 @@ int xfs_qm_scall_getquota( struct xfs_mount *mp, xfs_dqid_t id, - uint type, + xfs_dqtype_t type, struct qc_dqblk *dst) { struct xfs_dquot *dqp; @@ -715,7 +723,7 @@ int xfs_qm_scall_getquota_next( struct xfs_mount *mp, xfs_dqid_t *id, - uint type, + xfs_dqtype_t type, struct qc_dqblk *dst) { struct xfs_dquot *dqp; @@ -726,7 +734,7 @@ xfs_qm_scall_getquota_next( return error; /* Fill in the ID we actually read from disk */ - *id = be32_to_cpu(dqp->q_core.d_id); + *id = dqp->q_id; xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index aa8fc1f55fbd..06b22e35fc90 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -13,6 +13,7 @@ */ struct xfs_trans; +struct xfs_buf; /* * This check is done typically without holding the inode lock; @@ -38,14 +39,14 @@ struct xfs_trans; static inline uint xfs_quota_chkd_flag( - uint dqtype) + xfs_dqtype_t type) { - switch (dqtype) { - case XFS_DQ_USER: + switch (type) { + case XFS_DQTYPE_USER: return XFS_UQUOTA_CHKD; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return XFS_GQUOTA_CHKD; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return XFS_PQUOTA_CHKD; default: return 0; @@ -107,6 +108,8 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); extern void xfs_qm_unmount_quotas(struct xfs_mount *); +void xfs_dquot_done(struct xfs_buf *); + #else static inline int xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, @@ -148,6 +151,12 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) + +static inline void xfs_dquot_done(struct xfs_buf *bp) +{ + return; +} + #endif /* CONFIG_XFS_QUOTA */ #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index bf809b77a316..d27c0e852c0b 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -37,12 +37,12 @@ xfs_qm_fill_state( tstate->flags |= QCI_SYSFILE; tstate->blocks = ip->i_d.di_nblocks; tstate->nextents = ip->i_df.if_nextents; - tstate->spc_timelimit = (u32)defq->btimelimit; - tstate->ino_timelimit = (u32)defq->itimelimit; - tstate->rt_spc_timelimit = (u32)defq->rtbtimelimit; - tstate->spc_warnlimit = defq->bwarnlimit; - tstate->ino_warnlimit = defq->iwarnlimit; - tstate->rt_spc_warnlimit = defq->rtbwarnlimit; + tstate->spc_timelimit = (u32)defq->blk.time; + tstate->ino_timelimit = (u32)defq->ino.time; + tstate->rt_spc_timelimit = (u32)defq->rtb.time; + tstate->spc_warnlimit = 
defq->blk.warn; + tstate->ino_warnlimit = defq->ino.warn; + tstate->rt_spc_warnlimit = defq->rtb.warn; if (tempqip) xfs_irele(ip); } @@ -85,16 +85,16 @@ xfs_fs_get_quota_state( return 0; } -STATIC int +STATIC xfs_dqtype_t xfs_quota_type(int type) { switch (type) { case USRQUOTA: - return XFS_DQ_USER; + return XFS_DQTYPE_USER; case GRPQUOTA: - return XFS_DQ_GROUP; + return XFS_DQTYPE_GROUP; default: - return XFS_DQ_PROJ; + return XFS_DQTYPE_PROJ; } } @@ -205,11 +205,11 @@ xfs_fs_rm_xquota( return -EINVAL; if (uflags & FS_USER_QUOTA) - flags |= XFS_DQ_USER; + flags |= XFS_QMOPT_UQUOTA; if (uflags & FS_GROUP_QUOTA) - flags |= XFS_DQ_GROUP; + flags |= XFS_QMOPT_GQUOTA; if (uflags & FS_PROJ_QUOTA) - flags |= XFS_DQ_PROJ; + flags |= XFS_QMOPT_PQUOTA; return xfs_qm_scall_trunc_qfiles(mp, flags); } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index c81639891e29..7b2c72bc2858 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -143,7 +143,8 @@ xfs_cui_init( cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), 0); else - cuip = kmem_zone_zalloc(xfs_cui_zone, 0); + cuip = kmem_cache_zalloc(xfs_cui_zone, + GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); cuip->cui_format.cui_nextents = nextents; @@ -220,7 +221,7 @@ xfs_trans_get_cud( { struct xfs_cud_log_item *cudp; - cudp = kmem_zone_zalloc(xfs_cud_zone, 0); + cudp = kmem_cache_zalloc(xfs_cud_zone, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops); cudp->cud_cuip = cuip; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 107bf2a2f344..aac83f9d6107 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -179,7 +179,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) { + if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) { *shared = false; return 0; } @@ -655,7 +655,7 @@ xfs_reflink_end_cow_extent( * preallocations can leak into the range we are called upon, and we * need to skip them. */ - if (!xfs_bmap_is_real_extent(&got)) { + if (!xfs_bmap_is_written_extent(&got)) { *end_fsb = del.br_startoff; goto out_cancel; } @@ -984,40 +984,28 @@ xfs_reflink_ag_has_free_space( } /* - * Unmap a range of blocks from a file, then map other blocks into the hole. - * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). - * The extent irec is mapped into dest at irec->br_startoff. + * Remap the given extent into the file. The dmap blockcount will be set to + * the number of blocks that were actually remapped. 
*/ STATIC int xfs_reflink_remap_extent( struct xfs_inode *ip, - struct xfs_bmbt_irec *irec, - xfs_fileoff_t destoff, + struct xfs_bmbt_irec *dmap, xfs_off_t new_isize) { + struct xfs_bmbt_irec smap; struct xfs_mount *mp = ip->i_mount; - bool real_extent = xfs_bmap_is_real_extent(irec); struct xfs_trans *tp; - unsigned int resblks; - struct xfs_bmbt_irec uirec; - xfs_filblks_t rlen; - xfs_filblks_t unmap_len; xfs_off_t newlen; + int64_t qres, qdelta; + unsigned int resblks; + bool smap_real; + bool dmap_written = xfs_bmap_is_written_extent(dmap); + int nimaps; int error; - unmap_len = irec->br_startoff + irec->br_blockcount - destoff; - trace_xfs_reflink_punch_range(ip, destoff, unmap_len); - - /* No reflinking if we're low on space */ - if (real_extent) { - error = xfs_reflink_ag_has_free_space(mp, - XFS_FSB_TO_AGNO(mp, irec->br_startblock)); - if (error) - goto out; - } - /* Start a rolling transaction to switch the mappings */ - resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) goto out; @@ -1025,87 +1013,147 @@ xfs_reflink_remap_extent( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - /* If we're not just clearing space, then do we have enough quota? */ - if (real_extent) { - error = xfs_trans_reserve_quota_nblks(tp, ip, - irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); + /* + * Read what's currently mapped in the destination file into smap. + * If smap isn't a hole, we will have to remove it before we can add + * dmap to the destination file. + */ + nimaps = 1; + error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount, + &smap, &nimaps, 0); + if (error) + goto out_cancel; + ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff); + smap_real = xfs_bmap_is_real_extent(&smap); + + /* + * We can only remap as many blocks as the smaller of the two extent + * maps, because we can only remap one extent at a time. + */ + dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount); + ASSERT(dmap->br_blockcount == smap.br_blockcount); + + trace_xfs_reflink_remap_extent_dest(ip, &smap); + + /* + * Two extents mapped to the same physical block must not have + * different states; that's filesystem corruption. Move on to the next + * extent if they're both holes or both the same physical extent. + */ + if (dmap->br_startblock == smap.br_startblock) { + if (dmap->br_state != smap.br_state) + error = -EFSCORRUPTED; + goto out_cancel; + } + + /* If both extents are unwritten, leave them alone. */ + if (dmap->br_state == XFS_EXT_UNWRITTEN && + smap.br_state == XFS_EXT_UNWRITTEN) + goto out_cancel; + + /* No reflinking if the AG of the dest mapping is low on space. */ + if (dmap_written) { + error = xfs_reflink_ag_has_free_space(mp, + XFS_FSB_TO_AGNO(mp, dmap->br_startblock)); if (error) goto out_cancel; } - trace_xfs_reflink_remap(ip, irec->br_startoff, - irec->br_blockcount, irec->br_startblock); - - /* Unmap the old blocks in the data fork. */ - rlen = unmap_len; - while (rlen) { - ASSERT(tp->t_firstblock == NULLFSBLOCK); - error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1); + /* + * Compute quota reservation if we think the quota block counter for + * this file could increase. + * + * Adding a written extent to the extent map can cause a bmbt split, + * and removing a mapped extent from the extent can cause a bmbt split. 
+ * The two operations cannot both cause a split since they operate on + * the same index in the bmap btree, so we only need a reservation for + * one bmbt split if either thing is happening. + * + * If we are mapping a written extent into the file, we need to have + * enough quota block count reservation to handle the blocks in that + * extent. We log only the delta to the quota block counts, so if the + * extent we're unmapping also has blocks allocated to it, we don't + * need a quota reservation for the extent itself. + * + * Note that if we're replacing a delalloc reservation with a written + * extent, we have to take the full quota reservation because removing + * the delalloc reservation gives the block count back to the quota + * count. This is suboptimal, but the VFS flushed the dest range + * before we started. That should have removed all the delalloc + * reservations, but we code defensively. + */ + qres = qdelta = 0; + if (smap_real || dmap_written) + qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + if (!smap_real && dmap_written) + qres += dmap->br_blockcount; + if (qres > 0) { + error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, + XFS_QMOPT_RES_REGBLKS); if (error) goto out_cancel; + } + if (smap_real) { /* - * Trim the extent to whatever got unmapped. - * Remember, bunmapi works backwards. + * If the extent we're unmapping is backed by storage (written + * or not), unmap the extent and drop its refcount. */ - uirec.br_startblock = irec->br_startblock + rlen; - uirec.br_startoff = irec->br_startoff + rlen; - uirec.br_blockcount = unmap_len - rlen; - uirec.br_state = irec->br_state; - unmap_len = rlen; - - /* If this isn't a real mapping, we're done. */ - if (!real_extent || uirec.br_blockcount == 0) - goto next_extent; + xfs_bmap_unmap_extent(tp, ip, &smap); + xfs_refcount_decrease_extent(tp, &smap); + qdelta -= smap.br_blockcount; + } else if (smap.br_startblock == DELAYSTARTBLOCK) { + xfs_filblks_t len = smap.br_blockcount; - trace_xfs_reflink_remap(ip, uirec.br_startoff, - uirec.br_blockcount, uirec.br_startblock); - - /* Update the refcount tree */ - xfs_refcount_increase_extent(tp, &uirec); - - /* Map the new blocks into the data fork. */ - xfs_bmap_map_extent(tp, ip, &uirec); + /* + * If the extent we're unmapping is a delalloc reservation, + * we can use the regular bunmapi function to release the + * incore state. Dropping the delalloc reservation takes care + * of the quota reservation for us. + */ + error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1); + if (error) + goto out_cancel; + ASSERT(len == 0); + } - /* Update quota accounting. */ - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, - uirec.br_blockcount); + /* + * If the extent we're sharing is backed by written storage, increase + * its refcount and map it into the file. + */ + if (dmap_written) { + xfs_refcount_increase_extent(tp, dmap); + xfs_bmap_map_extent(tp, ip, dmap); + qdelta += dmap->br_blockcount; + } - /* Update dest isize if needed. */ - newlen = XFS_FSB_TO_B(mp, - uirec.br_startoff + uirec.br_blockcount); - newlen = min_t(xfs_off_t, newlen, new_isize); - if (newlen > i_size_read(VFS_I(ip))) { - trace_xfs_reflink_update_inode_size(ip, newlen); - i_size_write(VFS_I(ip), newlen); - ip->i_d.di_size = newlen; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - } + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta); -next_extent: - /* Process all the deferred stuff. */ - error = xfs_defer_finish(&tp); - if (error) - goto out_cancel; + /* Update dest isize if needed. 
*/ + newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount); + newlen = min_t(xfs_off_t, newlen, new_isize); + if (newlen > i_size_read(VFS_I(ip))) { + trace_xfs_reflink_update_inode_size(ip, newlen); + i_size_write(VFS_I(ip), newlen); + ip->i_d.di_size = newlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } + /* Commit everything and unlock. */ error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - goto out; - return 0; + goto out_unlock; out_cancel: xfs_trans_cancel(tp); +out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); out: - trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); + if (error) + trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); return error; } -/* - * Iteratively remap one file's extents (and holes) to another's. - */ +/* Remap a range of one file to the other. */ int xfs_reflink_remap_blocks( struct xfs_inode *src, @@ -1116,25 +1164,22 @@ xfs_reflink_remap_blocks( loff_t *remapped) { struct xfs_bmbt_irec imap; - xfs_fileoff_t srcoff; - xfs_fileoff_t destoff; + struct xfs_mount *mp = src->i_mount; + xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in); + xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out); xfs_filblks_t len; - xfs_filblks_t range_len; xfs_filblks_t remapped_len = 0; xfs_off_t new_isize = pos_out + remap_len; int nimaps; int error = 0; - destoff = XFS_B_TO_FSBT(src->i_mount, pos_out); - srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in); - len = XFS_B_TO_FSB(src->i_mount, remap_len); + len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len), + XFS_MAX_FILEOFF); - /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ - while (len) { - uint lock_mode; + trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff); - trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, - dest, destoff); + while (len > 0) { + unsigned int lock_mode; /* Read extent from the source file */ nimaps = 1; @@ -1143,18 +1188,25 @@ xfs_reflink_remap_blocks( xfs_iunlock(src, lock_mode); if (error) break; - ASSERT(nimaps == 1); - - trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK, - &imap); + /* + * The caller supposedly flushed all dirty pages in the source + * file range, which means that writeback should have allocated + * or deleted all delalloc reservations in that range. If we + * find one, that's a good sign that something is seriously + * wrong here. + */ + ASSERT(nimaps == 1 && imap.br_startoff == srcoff); + if (imap.br_startblock == DELAYSTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + error = -EFSCORRUPTED; + break; + } - /* Translate imap into the destination file. */ - range_len = imap.br_startoff + imap.br_blockcount - srcoff; - imap.br_startoff += destoff - srcoff; + trace_xfs_reflink_remap_extent_src(src, &imap); - /* Clear dest from destoff to the end of imap and map it in. */ - error = xfs_reflink_remap_extent(dest, &imap, destoff, - new_isize); + /* Remap into the destination file at the given offset. 
*/ + imap.br_startoff = destoff; + error = xfs_reflink_remap_extent(dest, &imap, new_isize); if (error) break; @@ -1164,10 +1216,10 @@ xfs_reflink_remap_blocks( } /* Advance drange/srange */ - srcoff += range_len; - destoff += range_len; - len -= range_len; - remapped_len += range_len; + srcoff += imap.br_blockcount; + destoff += imap.br_blockcount; + len -= imap.br_blockcount; + remapped_len += imap.br_blockcount; } if (error) @@ -1178,81 +1230,6 @@ xfs_reflink_remap_blocks( } /* - * Grab the exclusive iolock for a data copy from src to dest, making sure to - * abide vfs locking order (lowest pointer value goes first) and breaking the - * layout leases before proceeding. The loop is needed because we cannot call - * the blocking break_layout() with the iolocks held, and therefore have to - * back out both locks. - */ -static int -xfs_iolock_two_inodes_and_break_layout( - struct inode *src, - struct inode *dest) -{ - int error; - - if (src > dest) - swap(src, dest); - -retry: - /* Wait to break both inodes' layouts before we start locking. */ - error = break_layout(src, true); - if (error) - return error; - if (src != dest) { - error = break_layout(dest, true); - if (error) - return error; - } - - /* Lock one inode and make sure nobody got in and leased it. */ - inode_lock(src); - error = break_layout(src, false); - if (error) { - inode_unlock(src); - if (error == -EWOULDBLOCK) - goto retry; - return error; - } - - if (src == dest) - return 0; - - /* Lock the other inode and make sure nobody got in and leased it. */ - inode_lock_nested(dest, I_MUTEX_NONDIR2); - error = break_layout(dest, false); - if (error) { - inode_unlock(src); - inode_unlock(dest); - if (error == -EWOULDBLOCK) - goto retry; - return error; - } - - return 0; -} - -/* Unlock both inodes after they've been prepped for a range clone. */ -void -xfs_reflink_remap_unlock( - struct file *file_in, - struct file *file_out) -{ - struct inode *inode_in = file_inode(file_in); - struct xfs_inode *src = XFS_I(inode_in); - struct inode *inode_out = file_inode(file_out); - struct xfs_inode *dest = XFS_I(inode_out); - bool same_inode = (inode_in == inode_out); - - xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - if (!same_inode) - xfs_iunlock(src, XFS_MMAPLOCK_EXCL); - inode_unlock(inode_out); - if (!same_inode) - inode_unlock(inode_in); -} - -/* * If we're reflinking to a point past the destination file's EOF, we must * zero any speculative post-EOF preallocations that sit between the old EOF * and the destination file offset. @@ -1314,18 +1291,12 @@ xfs_reflink_remap_prep( struct xfs_inode *src = XFS_I(inode_in); struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); - bool same_inode = (inode_in == inode_out); - ssize_t ret; + int ret; /* Lock both files against IO */ - ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out); + ret = xfs_ilock2_io_mmap(src, dest); if (ret) return ret; - if (same_inode) - xfs_ilock(src, XFS_MMAPLOCK_EXCL); - else - xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest, - XFS_MMAPLOCK_EXCL); /* Check file eligibility and prepare for block sharing. 
*/ ret = -EINVAL; @@ -1339,7 +1310,7 @@ xfs_reflink_remap_prep( ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags); - if (ret < 0 || *len == 0) + if (ret || *len == 0) goto out_unlock; /* Attach dquots to dest inode before changing block map */ @@ -1374,9 +1345,9 @@ xfs_reflink_remap_prep( if (ret) goto out_unlock; - return 1; + return 0; out_unlock: - xfs_reflink_remap_unlock(file_in, file_out); + xfs_iunlock2_io_mmap(src, dest); return ret; } diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 3e4fd46373ab..487b00434b96 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -56,7 +56,5 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in, loff_t *remapped); extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, unsigned int remap_flags); -extern void xfs_reflink_remap_unlock(struct file *file_in, - struct file *file_out); #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index a86599db20a6..dc5b0753cd51 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -141,7 +141,8 @@ xfs_rui_init( if (nextents > XFS_RUI_MAX_FAST_EXTENTS) ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); else - ruip = kmem_zone_zalloc(xfs_rui_zone, 0); + ruip = kmem_cache_zalloc(xfs_rui_zone, + GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); ruip->rui_format.rui_nextents = nextents; @@ -243,7 +244,7 @@ xfs_trans_get_rud( { struct xfs_rud_log_item *rudp; - rudp = kmem_zone_zalloc(xfs_rud_zone, 0); + rudp = kmem_cache_zalloc(xfs_rud_zone, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, &xfs_rud_item_ops); rudp->rud_ruip = ruip; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 379cbff438bc..71ac6c1cdc36 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -890,9 +890,6 @@ xfs_quiesce_attr( /* force the log to unpin objects from the now complete transactions */ xfs_log_force(mp, XFS_LOG_SYNC); - /* reclaim inodes to do any IO before the freeze completes */ - xfs_reclaim_inodes(mp, 0); - xfs_reclaim_inodes(mp, SYNC_WAIT); /* Push the superblock and write an unmount record */ error = xfs_log_sbcount(mp); @@ -913,11 +910,21 @@ xfs_fs_freeze( struct super_block *sb) { struct xfs_mount *mp = XFS_M(sb); + unsigned int flags; + int ret; + /* + * The filesystem is now frozen far enough that memory reclaim + * cannot safely operate on the filesystem. Hence we need to + * set a GFP_NOFS context here to avoid recursion deadlocks. + */ + flags = memalloc_nofs_save(); xfs_stop_block_reaping(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); - return xfs_sync_sb(mp, true); + ret = xfs_sync_sb(mp, true); + memalloc_nofs_restore(flags); + return ret; } STATIC int @@ -1714,6 +1721,10 @@ xfs_fc_reconfigure( int flags = fc->sb_flags; int error; + /* version 5 superblocks always support version counters. 
*/ + if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) + fc->sb_flags |= SB_I_VERSION; + error = xfs_fc_validate_params(new_mp); if (error) return error; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 460136628a79..abb1d859f226 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -36,6 +36,7 @@ struct xfs_owner_info; struct xfs_trans_res; struct xfs_inobt_rec_incore; union xfs_btree_ptr; +struct xfs_dqtrx; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -864,44 +865,65 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, TP_STRUCT__entry( __field(dev_t, dev) __field(u32, id) + __field(xfs_dqtype_t, type) __field(unsigned, flags) __field(unsigned, nrefs) __field(unsigned long long, res_bcount) + __field(unsigned long long, res_rtbcount) + __field(unsigned long long, res_icount) + __field(unsigned long long, bcount) + __field(unsigned long long, rtbcount) __field(unsigned long long, icount) + __field(unsigned long long, blk_hardlimit) __field(unsigned long long, blk_softlimit) + __field(unsigned long long, rtb_hardlimit) + __field(unsigned long long, rtb_softlimit) __field(unsigned long long, ino_hardlimit) __field(unsigned long long, ino_softlimit) - ), \ + ), TP_fast_assign( __entry->dev = dqp->q_mount->m_super->s_dev; - __entry->id = be32_to_cpu(dqp->q_core.d_id); - __entry->flags = dqp->dq_flags; + __entry->id = dqp->q_id; + __entry->type = dqp->q_type; + __entry->flags = dqp->q_flags; __entry->nrefs = dqp->q_nrefs; - __entry->res_bcount = dqp->q_res_bcount; - __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); - __entry->icount = be64_to_cpu(dqp->q_core.d_icount); - __entry->blk_hardlimit = - be64_to_cpu(dqp->q_core.d_blk_hardlimit); - __entry->blk_softlimit = - be64_to_cpu(dqp->q_core.d_blk_softlimit); - __entry->ino_hardlimit = - be64_to_cpu(dqp->q_core.d_ino_hardlimit); - __entry->ino_softlimit = - be64_to_cpu(dqp->q_core.d_ino_softlimit); - ), - TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " + + __entry->res_bcount = dqp->q_blk.reserved; + __entry->res_rtbcount = dqp->q_rtb.reserved; + __entry->res_icount = dqp->q_ino.reserved; + + __entry->bcount = dqp->q_blk.count; + __entry->rtbcount = dqp->q_rtb.count; + __entry->icount = dqp->q_ino.count; + + __entry->blk_hardlimit = dqp->q_blk.hardlimit; + __entry->blk_softlimit = dqp->q_blk.softlimit; + __entry->rtb_hardlimit = dqp->q_rtb.hardlimit; + __entry->rtb_softlimit = dqp->q_rtb.softlimit; + __entry->ino_hardlimit = dqp->q_ino.hardlimit; + __entry->ino_softlimit = dqp->q_ino.softlimit; + ), + TP_printk("dev %d:%d id 0x%x type %s flags %s nrefs %u " + "res_bc 0x%llx res_rtbc 0x%llx res_ic 0x%llx " "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx " + "rtbcnt 0x%llx rtbhardlimit 0x%llx rtbsoftlimit 0x%llx " "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->id, - __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), + __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), __entry->nrefs, __entry->res_bcount, + __entry->res_rtbcount, + __entry->res_icount, __entry->bcount, __entry->blk_hardlimit, __entry->blk_softlimit, + __entry->rtbcount, + __entry->rtb_hardlimit, + __entry->rtb_softlimit, __entry->icount, __entry->ino_hardlimit, __entry->ino_softlimit) @@ -932,6 +954,125 @@ DEFINE_DQUOT_EVENT(xfs_dqrele); DEFINE_DQUOT_EVENT(xfs_dqflush); DEFINE_DQUOT_EVENT(xfs_dqflush_force); DEFINE_DQUOT_EVENT(xfs_dqflush_done); +DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before); 
+DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after); + +#define XFS_QMOPT_FLAGS \ + { XFS_QMOPT_UQUOTA, "UQUOTA" }, \ + { XFS_QMOPT_PQUOTA, "PQUOTA" }, \ + { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \ + { XFS_QMOPT_SBVERSION, "SBVERSION" }, \ + { XFS_QMOPT_GQUOTA, "GQUOTA" }, \ + { XFS_QMOPT_INHERIT, "INHERIT" }, \ + { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \ + { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \ + { XFS_QMOPT_BCOUNT, "BCOUNT" }, \ + { XFS_QMOPT_ICOUNT, "ICOUNT" }, \ + { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \ + { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \ + { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \ + { XFS_QMOPT_RES_INOS, "RES_INOS" } + +TRACE_EVENT(xfs_trans_mod_dquot, + TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp, + unsigned int field, int64_t delta), + TP_ARGS(tp, dqp, field, delta), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, type) + __field(unsigned int, flags) + __field(unsigned int, dqid) + __field(unsigned int, field) + __field(int64_t, delta) + ), + TP_fast_assign( + __entry->dev = tp->t_mountp->m_super->s_dev; + __entry->type = dqp->q_type; + __entry->flags = dqp->q_flags; + __entry->dqid = dqp->q_id; + __entry->field = field; + __entry->delta = delta; + ), + TP_printk("dev %d:%d dquot id 0x%x type %s flags %s field %s delta %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dqid, + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), + __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), + __print_flags(__entry->field, "|", XFS_QMOPT_FLAGS), + __entry->delta) +); + +DECLARE_EVENT_CLASS(xfs_dqtrx_class, + TP_PROTO(struct xfs_dqtrx *qtrx), + TP_ARGS(qtrx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, type) + __field(unsigned int, flags) + __field(u32, dqid) + + __field(uint64_t, blk_res) + __field(int64_t, bcount_delta) + __field(int64_t, delbcnt_delta) + + __field(uint64_t, rtblk_res) + __field(uint64_t, rtblk_res_used) + __field(int64_t, rtbcount_delta) + __field(int64_t, delrtb_delta) + + __field(uint64_t, ino_res) + __field(uint64_t, ino_res_used) + __field(int64_t, icount_delta) + ), + TP_fast_assign( + __entry->dev = qtrx->qt_dquot->q_mount->m_super->s_dev; + __entry->type = qtrx->qt_dquot->q_type; + __entry->flags = qtrx->qt_dquot->q_flags; + __entry->dqid = qtrx->qt_dquot->q_id; + + __entry->blk_res = qtrx->qt_blk_res; + __entry->bcount_delta = qtrx->qt_bcount_delta; + __entry->delbcnt_delta = qtrx->qt_delbcnt_delta; + + __entry->rtblk_res = qtrx->qt_rtblk_res; + __entry->rtblk_res_used = qtrx->qt_rtblk_res_used; + __entry->rtbcount_delta = qtrx->qt_rtbcount_delta; + __entry->delrtb_delta = qtrx->qt_delrtb_delta; + + __entry->ino_res = qtrx->qt_ino_res; + __entry->ino_res_used = qtrx->qt_ino_res_used; + __entry->icount_delta = qtrx->qt_icount_delta; + ), + TP_printk("dev %d:%d dquot id 0x%x type %s flags %s" + "blk_res %llu bcount_delta %lld delbcnt_delta %lld " + "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld " + "ino_res %llu ino_res_used %llu icount_delta %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dqid, + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), + __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), + + __entry->blk_res, + __entry->bcount_delta, + __entry->delbcnt_delta, + + __entry->rtblk_res, + __entry->rtblk_res_used, + __entry->rtbcount_delta, + __entry->delrtb_delta, + + __entry->ino_res, + __entry->ino_res_used, + __entry->icount_delta) +) + +#define DEFINE_DQTRX_EVENT(name) \ +DEFINE_EVENT(xfs_dqtrx_class, name, \ + TP_PROTO(struct 
xfs_dqtrx *qtrx), \ + TP_ARGS(qtrx)) +DEFINE_DQTRX_EVENT(xfs_trans_apply_dquot_deltas); +DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_before); +DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_after); DECLARE_EVENT_CLASS(xfs_loggrant_class, TP_PROTO(struct xlog *log, struct xlog_ticket *tic), @@ -3052,8 +3193,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); -DEFINE_IMAP_EVENT(xfs_reflink_remap_imap); -TRACE_EVENT(xfs_reflink_remap_blocks_loop, +TRACE_EVENT(xfs_reflink_remap_blocks, TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, xfs_filblks_t len, struct xfs_inode *dest, xfs_fileoff_t doffset), @@ -3084,59 +3224,14 @@ TRACE_EVENT(xfs_reflink_remap_blocks_loop, __entry->dest_ino, __entry->dest_lblk) ); -TRACE_EVENT(xfs_reflink_punch_range, - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, - xfs_extlen_t len), - TP_ARGS(ip, lblk, len), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fileoff_t, lblk) - __field(xfs_extlen_t, len) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->lblk = lblk; - __entry->len = len; - ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->lblk, - __entry->len) -); -TRACE_EVENT(xfs_reflink_remap, - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, - xfs_extlen_t len, xfs_fsblock_t new_pblk), - TP_ARGS(ip, lblk, len, new_pblk), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fileoff_t, lblk) - __field(xfs_extlen_t, len) - __field(xfs_fsblock_t, new_pblk) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->lblk = lblk; - __entry->len = len; - __entry->new_pblk = new_pblk; - ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->lblk, - __entry->len, - __entry->new_pblk) -); DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); +DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src); +DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest); /* dedupe tracepoints */ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); @@ -3582,7 +3677,6 @@ DEFINE_KMEM_EVENT(kmem_alloc); DEFINE_KMEM_EVENT(kmem_alloc_io); DEFINE_KMEM_EVENT(kmem_alloc_large); DEFINE_KMEM_EVENT(kmem_realloc); -DEFINE_KMEM_EVENT(kmem_zone_alloc); TRACE_EVENT(xfs_check_new_dalign, TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 3c94e5ff4316..ed72867b1a19 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -90,7 +90,7 @@ xfs_trans_dup( trace_xfs_trans_dup(tp, _RET_IP_); - ntp = kmem_zone_zalloc(xfs_trans_zone, 0); + ntp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL); /* * Initialize the new transaction structure. 
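[Note on the next two xfs_trans.c hunks: xfs_trans_dup() now carries XFS_TRANS_RES_FDBLKS into the duplicated transaction, and xfs_trans_mod_sb() learns to return freed blocks to the transaction's own block reservation instead of the global free-block pool, clamping so the unsigned t_blk_res counter cannot overflow. Below is a minimal user-space sketch of just that clamping arithmetic; struct fake_trans and mod_fdblocks() are hypothetical stand-ins for the kernel structures, not the kernel API itself.]

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct fake_trans {
	unsigned int	t_blk_res;		/* like tp->t_blk_res */
	int64_t		t_fdblocks_delta;	/* spill to the global pool */
};

/* Sketch of the XFS_TRANS_RES_FDBLKS branch added to xfs_trans_mod_sb(). */
static void mod_fdblocks(struct fake_trans *tp, int64_t delta)
{
	if (delta > 0) {
		int64_t blkres_delta = delta;

		/* Clamp the top-up so t_blk_res never exceeds UINT_MAX. */
		if (blkres_delta > (int64_t)(UINT_MAX - tp->t_blk_res))
			blkres_delta = UINT_MAX - tp->t_blk_res;
		tp->t_blk_res += blkres_delta;
		delta -= blkres_delta;
	}
	tp->t_fdblocks_delta += delta;	/* remainder goes to the global count */
}

int main(void)
{
	struct fake_trans tp = { .t_blk_res = UINT_MAX - 4 };

	mod_fdblocks(&tp, 10);			/* transaction frees 10 blocks */
	assert(tp.t_blk_res == UINT_MAX);	/* only 4 fit before the clamp */
	assert(tp.t_fdblocks_delta == 6);	/* the other 6 spill over */
	printf("res=%u spill=%lld\n", tp.t_blk_res,
	       (long long)tp.t_fdblocks_delta);
	return 0;
}

[This is why the dup path must propagate the flag: a chain of rolled transactions that repeatedly frees and allocates blocks keeps its reservation intact instead of racing the global counter.]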
@@ -107,7 +107,8 @@ xfs_trans_dup( ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE) | - (tp->t_flags & XFS_TRANS_NO_WRITECOUNT); + (tp->t_flags & XFS_TRANS_NO_WRITECOUNT) | + (tp->t_flags & XFS_TRANS_RES_FDBLKS); /* We gave our writer reference to the new transaction */ tp->t_flags |= XFS_TRANS_NO_WRITECOUNT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); @@ -262,7 +263,7 @@ xfs_trans_alloc( * GFP_NOFS allocation context so that we avoid lockdep false positives * by doing GFP_KERNEL allocations inside sb_start_intwrite(). */ - tp = kmem_zone_zalloc(xfs_trans_zone, 0); + tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL); if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); @@ -272,6 +273,8 @@ xfs_trans_alloc( */ WARN_ON(resp->tr_logres > 0 && mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); + ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || + xfs_sb_version_haslazysbcount(&mp->m_sb)); tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; @@ -365,6 +368,20 @@ xfs_trans_mod_sb( tp->t_blk_res_used += (uint)-delta; if (tp->t_blk_res_used > tp->t_blk_res) xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } else if (delta > 0 && (tp->t_flags & XFS_TRANS_RES_FDBLKS)) { + int64_t blkres_delta; + + /* + * Return freed blocks directly to the reservation + * instead of the global pool, being careful not to + * overflow the trans counter. This is used to preserve + * reservation across chains of transaction rolls that + * repeatedly free and allocate blocks. + */ + blkres_delta = min_t(int64_t, delta, + UINT_MAX - tp->t_blk_res); + tp->t_blk_res += blkres_delta; + delta -= blkres_delta; } tp->t_fdblocks_delta += delta; if (xfs_sb_version_haslazysbcount(&mp->m_sb)) diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 8308bf6d7e40..b752501818d2 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -37,10 +37,6 @@ struct xfs_log_item { unsigned long li_flags; /* misc flags */ struct xfs_buf *li_buf; /* real buffer pointer */ struct list_head li_bio_list; /* buffer item list */ - void (*li_cb)(struct xfs_buf *, - struct xfs_log_item *); - /* buffer item iodone */ - /* callback func */ const struct xfs_item_ops *li_ops; /* function list */ /* delayed logging */ @@ -78,7 +74,6 @@ struct xfs_item_ops { void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn); void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); - void (*iop_error)(struct xfs_log_item *, xfs_buf_t *); int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); }; diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index ac5019361a13..0c783d339675 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -377,8 +377,12 @@ xfsaild_resubmit_item( } /* protected by ail_lock */ - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) - xfs_clear_li_failed(lip); + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { + if (bp->b_flags & _XBF_INODES) + clear_bit(XFS_LI_FAILED, &lip->li_flags); + else + xfs_clear_li_failed(lip); + } xfs_buf_unlock(bp); return XFS_ITEM_SUCCESS; @@ -444,16 +448,10 @@ xfsaild_push( target = ailp->ail_target; ailp->ail_target_prev = target; + /* we're done if the AIL is empty or our push has reached the end */ lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn); - if (!lip) { - /* - * If the AIL is empty or our push has reached the end we are - * done now. 
- */ - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); + if (!lip) goto out_done; - } XFS_STATS_INC(mp, xs_push_ail); @@ -535,6 +533,8 @@ xfsaild_push( break; lsn = lip->li_lsn; } + +out_done: xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->ail_lock); @@ -542,7 +542,6 @@ xfsaild_push( ailp->ail_log_flush++; if (!count || XFS_LSN_CMP(lsn, target) >= 0) { -out_done: /* * We reached the target or the AIL is empty, so wait a bit * longer for I/O to complete and remove pushed items from the @@ -634,7 +633,8 @@ xfsaild( */ smp_rmb(); if (!xfs_ail_min(ailp) && - ailp->ail_target == ailp->ail_target_prev) { + ailp->ail_target == ailp->ail_target_prev && + list_empty(&ailp->ail_buf_list)) { spin_unlock(&ailp->ail_lock); freezable_schedule(); tout = 0; @@ -843,7 +843,6 @@ xfs_ail_delete_one( trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); - xfs_clear_li_failed(lip); clear_bit(XFS_LI_IN_AIL, &lip->li_flags); lip->li_lsn = 0; @@ -874,6 +873,7 @@ xfs_trans_ail_delete( } /* xfs_ail_update_finish() drops the AIL lock */ + xfs_clear_li_failed(lip); tail_lsn = xfs_ail_delete_one(ailp, lip); xfs_ail_update_finish(ailp, tail_lsn); } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 08174ffa2118..11cd666cd99a 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -465,24 +465,16 @@ xfs_trans_dirty_buf( ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); - ASSERT(bp->b_iodone == NULL || - bp->b_iodone == xfs_buf_iodone_callbacks); /* * Mark the buffer as needing to be written out eventually, * and set its iodone function to remove the buffer's buf log * item from the AIL and free it when the buffer is flushed - * to disk. See xfs_buf_attach_iodone() for more details - * on li_cb and xfs_buf_iodone_callbacks(). - * If we end up aborting this transaction, we trap this buffer - * inside the b_bdstrat callback so that this won't get written to - * disk. + * to disk. 
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 08174ffa2118..11cd666cd99a 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -465,24 +465,16 @@ xfs_trans_dirty_buf(
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
-	ASSERT(bp->b_iodone == NULL ||
-	       bp->b_iodone == xfs_buf_iodone_callbacks);
 
 	/*
 	 * Mark the buffer as needing to be written out eventually,
 	 * and set its iodone function to remove the buffer's buf log
 	 * item from the AIL and free it when the buffer is flushed
-	 * to disk.  See xfs_buf_attach_iodone() for more details
-	 * on li_cb and xfs_buf_iodone_callbacks().
-	 * If we end up aborting this transaction, we trap this buffer
-	 * inside the b_bdstrat callback so that this won't get written to
-	 * disk.
+	 * to disk.
 	 */
 	bp->b_flags |= XBF_DONE;
 
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
-	bp->b_iodone = xfs_buf_iodone_callbacks;
-	bip->bli_item.li_cb = xfs_buf_iodone;
 
 	/*
 	 * If we invalidated the buffer within this transaction, then
@@ -626,6 +618,7 @@ xfs_trans_inode_buf(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	bip->bli_flags |= XFS_BLI_INODE_BUF;
+	bp->b_flags |= _XBF_INODES;
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
 }
@@ -650,7 +643,7 @@ xfs_trans_stale_inode_buf(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	bip->bli_flags |= XFS_BLI_STALE_INODE;
-	bip->bli_item.li_cb = xfs_buf_iodone;
+	bp->b_flags |= _XBF_INODES;
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
 }
@@ -675,6 +668,7 @@ xfs_trans_inode_alloc_buf(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
+	bp->b_flags |= _XBF_INODES;
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
 }
@@ -785,5 +779,6 @@ xfs_trans_dquot_buf(
 		break;
 	}
 
+	bp->b_flags |= _XBF_DQUOTS;
 	xfs_trans_buf_set_type(tp, bp, type);
 }
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index c0f73b82c055..c6ba7ef18e06 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -15,6 +15,7 @@
 #include "xfs_trans_priv.h"
 #include "xfs_quota.h"
 #include "xfs_qm.h"
+#include "xfs_trace.h"
 
 STATIC void	xfs_trans_alloc_dqinfo(xfs_trans_t *);
@@ -155,14 +156,19 @@ xfs_trans_get_dqtrx(
 	int			i;
 	struct xfs_dqtrx	*qa;
 
-	if (XFS_QM_ISUDQ(dqp))
+	switch (xfs_dquot_type(dqp)) {
+	case XFS_DQTYPE_USER:
 		qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
-	else if (XFS_QM_ISGDQ(dqp))
+		break;
+	case XFS_DQTYPE_GROUP:
 		qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
-	else if (XFS_QM_ISPDQ(dqp))
+		break;
+	case XFS_DQTYPE_PROJ:
 		qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ];
-	else
+		break;
+	default:
 		return NULL;
+	}
 
 	for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
 		if (qa[i].qt_dquot == NULL ||
@@ -203,6 +209,11 @@ xfs_trans_mod_dquot(
 	if (qtrx->qt_dquot == NULL)
 		qtrx->qt_dquot = dqp;
 
+	if (delta) {
+		trace_xfs_trans_mod_dquot_before(qtrx);
+		trace_xfs_trans_mod_dquot(tp, dqp, field, delta);
+	}
+
 	switch (field) {
 	/*
@@ -266,6 +277,10 @@ xfs_trans_mod_dquot(
 	default:
 		ASSERT(0);
 	}
+
+	if (delta)
+		trace_xfs_trans_mod_dquot_after(qtrx);
+
 	tp->t_flags |= XFS_TRANS_DQ_DIRTY;
 }
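The xfs_trans_get_dqtrx() rewrite above replaces the chained flag tests with a switch on the dquot type, so user, group, and project dquots each select their own per-transaction delta array and anything unrecognized fails explicitly. A self-contained illustration of that dispatch shape; the enum values and array sizes are invented for the demo:

#include <assert.h>
#include <stddef.h>

enum dqtype { DQTYPE_USER, DQTYPE_GROUP, DQTYPE_PROJ, DQTYPE_BOGUS };

struct dqtrx { int dummy; };

struct dqinfo {
	struct dqtrx	usr[4];
	struct dqtrx	grp[4];
	struct dqtrx	prj[4];
};

static struct dqtrx *get_dqtrx(struct dqinfo *di, enum dqtype type)
{
	switch (type) {
	case DQTYPE_USER:
		return di->usr;
	case DQTYPE_GROUP:
		return di->grp;
	case DQTYPE_PROJ:
		return di->prj;
	default:
		return NULL;	/* unknown type: no delta tracking */
	}
}

int main(void)
{
	struct dqinfo di = { 0 };

	assert(get_dqtrx(&di, DQTYPE_GROUP) == di.grp);
	assert(get_dqtrx(&di, DQTYPE_BOGUS) == NULL);
	return 0;
}

The explicit default arm keeps the mapping total: any type outside the three known ones yields no delta tracking instead of falling into an else by accident.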
@@ -293,6 +308,37 @@ xfs_trans_dqlockedjoin(
 	}
 }
 
+/* Apply dqtrx changes to the quota reservation counters. */
+static inline void
+xfs_apply_quota_reservation_deltas(
+	struct xfs_dquot_res	*res,
+	uint64_t		reserved,
+	int64_t			res_used,
+	int64_t			count_delta)
+{
+	if (reserved != 0) {
+		/*
+		 * Subtle math here: If reserved > res_used (the normal case),
+		 * we're simply subtracting the unused transaction quota
+		 * reservation from the dquot reservation.
+		 *
+		 * If, however, res_used > reserved, then we have allocated
+		 * more quota blocks than were reserved for the transaction.
+		 * We must add that excess to the dquot reservation since it
+		 * tracks (usage + resv) and by definition we didn't reserve
+		 * that excess.
+		 */
+		res->reserved -= abs(reserved - res_used);
+	} else if (count_delta != 0) {
+		/*
+		 * These blks were never reserved, either inside a transaction
+		 * or outside one (in a delayed allocation). Also, this isn't
+		 * always a negative number since we sometimes deliberately
+		 * skip quota reservations.
+		 */
+		res->reserved += count_delta;
+	}
+}
 
 /*
  * Called by xfs_trans_commit() and similar in spirit to
@@ -309,7 +355,6 @@ xfs_trans_apply_dquot_deltas(
 	int			i, j;
 	struct xfs_dquot	*dqp;
 	struct xfs_dqtrx	*qtrx, *qa;
-	struct xfs_disk_dquot	*d;
 	int64_t			totalbdelta;
 	int64_t			totalrtbdelta;
@@ -328,6 +373,8 @@ xfs_trans_apply_dquot_deltas(
 		xfs_trans_dqlockedjoin(tp, qa);
 
 		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+			uint64_t	blk_res_used;
+
 			qtrx = &qa[i];
 			/*
 			 * The array of dquots is filled
@@ -341,7 +388,6 @@ xfs_trans_apply_dquot_deltas(
 			/*
 			 * adjust the actual number of blocks used
 			 */
-			d = &dqp->q_core;
 
 			/*
 			 * The issue here is - sometimes we don't make a blkquota
@@ -360,38 +406,46 @@ xfs_trans_apply_dquot_deltas(
 					qtrx->qt_delbcnt_delta;
 			totalrtbdelta = qtrx->qt_rtbcount_delta +
 					qtrx->qt_delrtb_delta;
+
+			if (totalbdelta != 0 || totalrtbdelta != 0 ||
+			    qtrx->qt_icount_delta != 0) {
+				trace_xfs_trans_apply_dquot_deltas_before(dqp);
+				trace_xfs_trans_apply_dquot_deltas(qtrx);
+			}
+
 #ifdef DEBUG
 			if (totalbdelta < 0)
-				ASSERT(be64_to_cpu(d->d_bcount) >=
-				       -totalbdelta);
+				ASSERT(dqp->q_blk.count >= -totalbdelta);
 
 			if (totalrtbdelta < 0)
-				ASSERT(be64_to_cpu(d->d_rtbcount) >=
-				       -totalrtbdelta);
+				ASSERT(dqp->q_rtb.count >= -totalrtbdelta);
 
 			if (qtrx->qt_icount_delta < 0)
-				ASSERT(be64_to_cpu(d->d_icount) >=
-				       -qtrx->qt_icount_delta);
+				ASSERT(dqp->q_ino.count >= -qtrx->qt_icount_delta);
 #endif
 			if (totalbdelta)
-				be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta);
+				dqp->q_blk.count += totalbdelta;
 
 			if (qtrx->qt_icount_delta)
-				be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta);
+				dqp->q_ino.count += qtrx->qt_icount_delta;
 
 			if (totalrtbdelta)
-				be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta);
+				dqp->q_rtb.count += totalrtbdelta;
+
+			if (totalbdelta != 0 || totalrtbdelta != 0 ||
+			    qtrx->qt_icount_delta != 0)
+				trace_xfs_trans_apply_dquot_deltas_after(dqp);
 
 			/*
 			 * Get any default limits in use.
 			 * Start/reset the timer(s) if needed.
 			 */
-			if (d->d_id) {
-				xfs_qm_adjust_dqlimits(tp->t_mountp, dqp);
-				xfs_qm_adjust_dqtimers(tp->t_mountp, dqp);
+			if (dqp->q_id) {
+				xfs_qm_adjust_dqlimits(dqp);
+				xfs_qm_adjust_dqtimers(dqp);
 			}
 
-			dqp->dq_flags |= XFS_DQ_DIRTY;
+			dqp->q_flags |= XFS_DQFLAG_DIRTY;
 			/*
 			 * add this to the list of items to get logged
 			 */
@@ -401,78 +455,31 @@ xfs_trans_apply_dquot_deltas(
 			 * In case of delayed allocations, there's no
 			 * reservation that a transaction structure knows of.
 			 */
-			if (qtrx->qt_blk_res != 0) {
-				uint64_t	blk_res_used = 0;
-
-				if (qtrx->qt_bcount_delta > 0)
-					blk_res_used = qtrx->qt_bcount_delta;
-
-				if (qtrx->qt_blk_res != blk_res_used) {
-					if (qtrx->qt_blk_res > blk_res_used)
-						dqp->q_res_bcount -= (xfs_qcnt_t)
-							(qtrx->qt_blk_res -
-							 blk_res_used);
-					else
-						dqp->q_res_bcount -= (xfs_qcnt_t)
-							(blk_res_used -
-							 qtrx->qt_blk_res);
-				}
-			} else {
-				/*
-				 * These blks were never reserved, either inside
-				 * a transaction or outside one (in a delayed
-				 * allocation). Also, this isn't always a
-				 * negative number since we sometimes
-				 * deliberately skip quota reservations.
-				 */
-				if (qtrx->qt_bcount_delta) {
-					dqp->q_res_bcount +=
-						(xfs_qcnt_t)qtrx->qt_bcount_delta;
-				}
-			}
+			blk_res_used = max_t(int64_t, 0, qtrx->qt_bcount_delta);
+			xfs_apply_quota_reservation_deltas(&dqp->q_blk,
+					qtrx->qt_blk_res, blk_res_used,
+					qtrx->qt_bcount_delta);
+
 			/*
 			 * Adjust the RT reservation.
 			 */
-			if (qtrx->qt_rtblk_res != 0) {
-				if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) {
-					if (qtrx->qt_rtblk_res >
-					    qtrx->qt_rtblk_res_used)
-						dqp->q_res_rtbcount -= (xfs_qcnt_t)
-							(qtrx->qt_rtblk_res -
-							 qtrx->qt_rtblk_res_used);
-					else
-						dqp->q_res_rtbcount -= (xfs_qcnt_t)
-							(qtrx->qt_rtblk_res_used -
-							 qtrx->qt_rtblk_res);
-				}
-			} else {
-				if (qtrx->qt_rtbcount_delta)
-					dqp->q_res_rtbcount +=
-						(xfs_qcnt_t)qtrx->qt_rtbcount_delta;
-			}
+			xfs_apply_quota_reservation_deltas(&dqp->q_rtb,
+					qtrx->qt_rtblk_res,
+					qtrx->qt_rtblk_res_used,
+					qtrx->qt_rtbcount_delta);
 
 			/*
 			 * Adjust the inode reservation.
 			 */
-			if (qtrx->qt_ino_res != 0) {
-				ASSERT(qtrx->qt_ino_res >=
-				       qtrx->qt_ino_res_used);
-				if (qtrx->qt_ino_res > qtrx->qt_ino_res_used)
-					dqp->q_res_icount -= (xfs_qcnt_t)
-						(qtrx->qt_ino_res -
-						 qtrx->qt_ino_res_used);
-			} else {
-				if (qtrx->qt_icount_delta)
-					dqp->q_res_icount +=
-						(xfs_qcnt_t)qtrx->qt_icount_delta;
-			}
-
-			ASSERT(dqp->q_res_bcount >=
-				be64_to_cpu(dqp->q_core.d_bcount));
-			ASSERT(dqp->q_res_icount >=
-				be64_to_cpu(dqp->q_core.d_icount));
-			ASSERT(dqp->q_res_rtbcount >=
-				be64_to_cpu(dqp->q_core.d_rtbcount));
+			ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
+			xfs_apply_quota_reservation_deltas(&dqp->q_ino,
+					qtrx->qt_ino_res,
+					qtrx->qt_ino_res_used,
+					qtrx->qt_icount_delta);
+
+			ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
+			ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
+			ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
 		}
 	}
 }
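xfs_apply_quota_reservation_deltas() collapses three nearly identical open-coded blocks into one helper whose core is reserved -= abs(resv - used). In the common case that simply hands back the unused part of the transaction's quota reservation; the no-reservation arm applies the raw count delta because nothing was ever reserved (the delayed-allocation case). A throwaway userspace check of that arithmetic, using plain C stand-ins rather than the kernel types:

#include <assert.h>
#include <stdint.h>

struct res { int64_t reserved; };

static void apply_resv_deltas(struct res *r, int64_t resv, int64_t used,
			      int64_t count_delta)
{
	if (resv != 0) {
		/* abs() also covers the rare used > resv overrun direction */
		int64_t d = resv - used;

		r->reserved -= (d < 0 ? -d : d);
	} else if (count_delta != 0) {
		/* never reserved (e.g. delayed allocation) */
		r->reserved += count_delta;
	}
}

int main(void)
{
	struct res r;

	/* Reserved 10, used 6: the unused 4 are returned. */
	r.reserved = 100;
	apply_resv_deltas(&r, 10, 6, 6);
	assert(r.reserved == 96);

	/* No reservation: a negative delta just shrinks the counter. */
	r.reserved = 100;
	apply_resv_deltas(&r, 0, 0, -5);
	assert(r.reserved == 95);
	return 0;
}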
@@ -516,7 +523,7 @@ xfs_trans_unreserve_and_mod_dquots(
 			if (qtrx->qt_blk_res) {
 				xfs_dqlock(dqp);
 				locked = true;
-				dqp->q_res_bcount -=
+				dqp->q_blk.reserved -=
 					(xfs_qcnt_t)qtrx->qt_blk_res;
 			}
 			if (qtrx->qt_ino_res) {
@@ -524,7 +531,7 @@ xfs_trans_unreserve_and_mod_dquots(
 					xfs_dqlock(dqp);
 					locked = true;
 				}
-				dqp->q_res_icount -=
+				dqp->q_ino.reserved -=
 					(xfs_qcnt_t)qtrx->qt_ino_res;
 			}
@@ -533,7 +540,7 @@ xfs_trans_unreserve_and_mod_dquots(
 					xfs_dqlock(dqp);
 					locked = true;
 				}
-				dqp->q_res_rtbcount -=
+				dqp->q_rtb.reserved -=
 					(xfs_qcnt_t)qtrx->qt_rtblk_res;
 			}
 			if (locked)
@@ -549,21 +556,80 @@ xfs_quota_warn(
 	struct xfs_dquot	*dqp,
 	int			type)
 {
-	enum quota_type qtype;
+	enum quota_type		qtype;
 
-	if (dqp->dq_flags & XFS_DQ_PROJ)
+	switch (xfs_dquot_type(dqp)) {
+	case XFS_DQTYPE_PROJ:
 		qtype = PRJQUOTA;
-	else if (dqp->dq_flags & XFS_DQ_USER)
+		break;
+	case XFS_DQTYPE_USER:
 		qtype = USRQUOTA;
-	else
+		break;
+	case XFS_DQTYPE_GROUP:
 		qtype = GRPQUOTA;
+		break;
+	default:
+		return;
+	}
 
-	quota_send_warning(make_kqid(&init_user_ns, qtype,
-				     be32_to_cpu(dqp->q_core.d_id)),
+	quota_send_warning(make_kqid(&init_user_ns, qtype, dqp->q_id),
 			   mp->m_super->s_dev, type);
 }
 
 /*
+ * Decide if we can make an additional reservation against a quota resource.
+ * Returns an inode QUOTA_NL_ warning code and whether or not it's fatal.
+ *
+ * Note that we assume that the numeric difference between the inode and block
+ * warning codes will always be 3 since it's userspace ABI now, and will never
+ * decrease the quota reservation, so the *BELOW messages are irrelevant.
+ */
+static inline int
+xfs_dqresv_check(
+	struct xfs_dquot_res	*res,
+	struct xfs_quota_limits	*qlim,
+	int64_t			delta,
+	bool			*fatal)
+{
+	xfs_qcnt_t		hardlimit = res->hardlimit;
+	xfs_qcnt_t		softlimit = res->softlimit;
+	xfs_qcnt_t		total_count = res->reserved + delta;
+
+	BUILD_BUG_ON(QUOTA_NL_BHARDWARN != QUOTA_NL_IHARDWARN + 3);
+	BUILD_BUG_ON(QUOTA_NL_BSOFTLONGWARN != QUOTA_NL_ISOFTLONGWARN + 3);
+	BUILD_BUG_ON(QUOTA_NL_BSOFTWARN != QUOTA_NL_ISOFTWARN + 3);
+
+	*fatal = false;
+	if (delta <= 0)
+		return QUOTA_NL_NOWARN;
+
+	if (!hardlimit)
+		hardlimit = qlim->hard;
+	if (!softlimit)
+		softlimit = qlim->soft;
+
+	if (hardlimit && total_count > hardlimit) {
+		*fatal = true;
+		return QUOTA_NL_IHARDWARN;
+	}
+
+	if (softlimit && total_count > softlimit) {
+		time64_t	now = ktime_get_real_seconds();
+
+		if ((res->timer != 0 && now > res->timer) ||
+		    (res->warnings != 0 && res->warnings >= qlim->warn)) {
+			*fatal = true;
+			return QUOTA_NL_ISOFTLONGWARN;
+		}
+
+		res->warnings++;
+		return QUOTA_NL_ISOFTWARN;
+	}
+
+	return QUOTA_NL_NOWARN;
+}
+
+/*
  * This reserves disk blocks and inodes against a dquot.
  * Flags indicate if the dquot is to be locked here and also
  * if the blk reservation is for RT or regular blocks.
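xfs_dqresv_check() deliberately returns the inode-flavored warning codes and lets the block-limit caller add 3, relying on the QUOTA_NL_* numbering being frozen userspace ABI. The values below are copied from include/uapi/linux/quota.h as I understand them; the asserts mirror the BUILD_BUG_ON()s in the helper:

#include <assert.h>

enum {
	QUOTA_NL_NOWARN = 0,
	QUOTA_NL_IHARDWARN,		/* 1: inode hardlimit reached */
	QUOTA_NL_ISOFTLONGWARN,		/* 2: inode grace time expired */
	QUOTA_NL_ISOFTWARN,		/* 3: inode softlimit reached */
	QUOTA_NL_BHARDWARN,		/* 4: block hardlimit reached */
	QUOTA_NL_BSOFTLONGWARN,		/* 5: block grace time expired */
	QUOTA_NL_BSOFTWARN,		/* 6: block softlimit reached */
};

int main(void)
{
	/* The same +3 relationships xfs_dqresv_check() asserts at build time. */
	assert(QUOTA_NL_BHARDWARN == QUOTA_NL_IHARDWARN + 3);
	assert(QUOTA_NL_BSOFTLONGWARN == QUOTA_NL_ISOFTLONGWARN + 3);
	assert(QUOTA_NL_BSOFTWARN == QUOTA_NL_ISOFTWARN + 3);
	return 0;
}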
@@ -578,110 +644,58 @@ xfs_trans_dqresv(
 	long			ninos,
 	uint			flags)
 {
-	xfs_qcnt_t		hardlimit;
-	xfs_qcnt_t		softlimit;
-	time64_t		timer;
-	xfs_qwarncnt_t		warns;
-	xfs_qwarncnt_t		warnlimit;
-	xfs_qcnt_t		total_count;
-	xfs_qcnt_t		*resbcountp;
 	struct xfs_quotainfo	*q = mp->m_quotainfo;
 	struct xfs_def_quota	*defq;
-
+	struct xfs_dquot_res	*blkres;
+	struct xfs_quota_limits	*qlim;
 
 	xfs_dqlock(dqp);
 
 	defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
 
 	if (flags & XFS_TRANS_DQ_RES_BLKS) {
-		hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
-		if (!hardlimit)
-			hardlimit = defq->bhardlimit;
-		softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
-		if (!softlimit)
-			softlimit = defq->bsoftlimit;
-		timer = be32_to_cpu(dqp->q_core.d_btimer);
-		warns = be16_to_cpu(dqp->q_core.d_bwarns);
-		warnlimit = defq->bwarnlimit;
-		resbcountp = &dqp->q_res_bcount;
+		blkres = &dqp->q_blk;
+		qlim = &defq->blk;
 	} else {
-		ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
-		hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
-		if (!hardlimit)
-			hardlimit = defq->rtbhardlimit;
-		softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
-		if (!softlimit)
-			softlimit = defq->rtbsoftlimit;
-		timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
-		warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
-		warnlimit = defq->rtbwarnlimit;
-		resbcountp = &dqp->q_res_rtbcount;
+		blkres = &dqp->q_rtb;
+		qlim = &defq->rtb;
 	}
 
-	if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
-	    dqp->q_core.d_id &&
-	    ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
-	     (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
-	     (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
-		if (nblks > 0) {
+	if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_id &&
+	    xfs_dquot_is_enforced(dqp)) {
+		int		quota_nl;
+		bool		fatal;
+
+		/*
+		 * dquot is locked already. See if we'd go over the hardlimit
+		 * or exceed the timelimit if we'd reserve resources.
+		 */
+		quota_nl = xfs_dqresv_check(blkres, qlim, nblks, &fatal);
+		if (quota_nl != QUOTA_NL_NOWARN) {
 			/*
-			 * dquot is locked already. See if we'd go over the
-			 * hardlimit or exceed the timelimit if we allocate
-			 * nblks.
+			 * Quota block warning codes are 3 more than the inode
+			 * codes, which we check above.
 			 */
-			total_count = *resbcountp + nblks;
-			if (hardlimit && total_count > hardlimit) {
-				xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
+			xfs_quota_warn(mp, dqp, quota_nl + 3);
+			if (fatal)
 				goto error_return;
-			}
-			if (softlimit && total_count > softlimit) {
-				if ((timer != 0 &&
-				     ktime_get_real_seconds() > timer) ||
-				    (warns != 0 && warns >= warnlimit)) {
-					xfs_quota_warn(mp, dqp,
-						       QUOTA_NL_BSOFTLONGWARN);
-					goto error_return;
-				}
-
-				xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
-			}
 		}
-		if (ninos > 0) {
-			total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos;
-			timer = be32_to_cpu(dqp->q_core.d_itimer);
-			warns = be16_to_cpu(dqp->q_core.d_iwarns);
-			warnlimit = defq->iwarnlimit;
-			hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
-			if (!hardlimit)
-				hardlimit = defq->ihardlimit;
-			softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
-			if (!softlimit)
-				softlimit = defq->isoftlimit;
-
-			if (hardlimit && total_count > hardlimit) {
-				xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
+
+		quota_nl = xfs_dqresv_check(&dqp->q_ino, &defq->ino, ninos,
+				&fatal);
+		if (quota_nl != QUOTA_NL_NOWARN) {
+			xfs_quota_warn(mp, dqp, quota_nl);
+			if (fatal)
 				goto error_return;
-			}
-			if (softlimit && total_count > softlimit) {
-				if ((timer != 0 &&
-				     ktime_get_real_seconds() > timer) ||
-				    (warns != 0 && warns >= warnlimit)) {
-					xfs_quota_warn(mp, dqp,
						       QUOTA_NL_ISOFTLONGWARN);
-					goto error_return;
-				}
-				xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
-			}
 		}
 	}
 
 	/*
 	 * Change the reservation, but not the actual usage.
-	 * Note that q_res_bcount = q_core.d_bcount + resv
+	 * Note that q_blk.reserved = q_blk.count + resv
 	 */
-	(*resbcountp) += (xfs_qcnt_t)nblks;
-	if (ninos != 0)
-		dqp->q_res_icount += (xfs_qcnt_t)ninos;
+	blkres->reserved += (xfs_qcnt_t)nblks;
+	dqp->q_ino.reserved += (xfs_qcnt_t)ninos;
 
 	/*
 	 * note the reservation amt in the trans struct too,
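The rewritten xfs_trans_dqresv() above delegates both the block and the inode limit decisions to xfs_dqresv_check(): breaching the hard limit is always fatal, breaching the soft limit turns fatal only once the grace timer has expired or the warning budget is spent, and anything else is at most advisory; only after both checks pass are the reservation counters touched. A simplified userspace rendering of that ladder, with flattened types and a caller-supplied clock, not the kernel implementation:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

enum warn { NOWARN, HARDWARN, SOFTLONGWARN, SOFTWARN };

struct resv {
	uint64_t	reserved;
	uint64_t	hardlimit;	/* 0 = no limit */
	uint64_t	softlimit;	/* 0 = no limit */
	int64_t		timer;		/* grace deadline, 0 = unset */
	unsigned int	warnings;
	unsigned int	warn_budget;
};

static enum warn check(struct resv *r, int64_t delta, int64_t now, bool *fatal)
{
	uint64_t total = r->reserved + delta;

	*fatal = false;
	if (delta <= 0)
		return NOWARN;		/* freeing never warns */
	if (r->hardlimit && total > r->hardlimit) {
		*fatal = true;
		return HARDWARN;
	}
	if (r->softlimit && total > r->softlimit) {
		if ((r->timer && now > r->timer) ||
		    (r->warnings && r->warnings >= r->warn_budget)) {
			*fatal = true;
			return SOFTLONGWARN;
		}
		r->warnings++;
		return SOFTWARN;	/* advisory only */
	}
	return NOWARN;
}

int main(void)
{
	struct resv r = { .reserved = 90, .hardlimit = 200, .softlimit = 100,
			  .timer = 1000, .warnings = 0, .warn_budget = 5 };
	bool fatal;

	assert(check(&r, 20, 500, &fatal) == SOFTWARN && !fatal);
	assert(check(&r, 20, 2000, &fatal) == SOFTLONGWARN && fatal);
	assert(check(&r, 200, 500, &fatal) == HARDWARN && fatal);
	return 0;
}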
@@ -702,16 +716,16 @@ xfs_trans_dqresv(
 				XFS_TRANS_DQ_RES_INOS,
 				ninos);
 	}
-	ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount));
-	ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
-	ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
+	ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
+	ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
+	ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
 
 	xfs_dqunlock(dqp);
 	return 0;
 
 error_return:
 	xfs_dqunlock(dqp);
-	if (XFS_QM_ISPDQ(dqp))
+	if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ)
 		return -ENOSPC;
 	return -EDQUOT;
 }
 
@@ -860,7 +874,8 @@ STATIC void
 xfs_trans_alloc_dqinfo(
 	xfs_trans_t	*tp)
 {
-	tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
+	tp->t_dqinfo = kmem_cache_zalloc(xfs_qm_dqtrxzone,
+					 GFP_KERNEL | __GFP_NOFAIL);
 }
 
 void
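The kmem_zone_zalloc() call sites converted in this series (here and in xfs_trans_alloc() above) move the never-fail retry semantics into the allocator itself by passing GFP_KERNEL | __GFP_NOFAIL to kmem_cache_zalloc(), rather than looping in an XFS wrapper. A loose userspace analogue of "zeroed allocation that keeps retrying until it succeeds", purely for illustration:

#include <sched.h>
#include <stdlib.h>

struct dqinfo_demo { char pad[64]; };	/* stand-in for a slab object */

/* Zeroing allocation that never returns NULL, like __GFP_NOFAIL. */
static void *zalloc_nofail(size_t size)
{
	void *p;

	while ((p = calloc(1, size)) == NULL)
		sched_yield();	/* back off, then retry forever */
	return p;
}

int main(void)
{
	struct dqinfo_demo *di = zalloc_nofail(sizeof(*di));

	free(di);
	return 0;
}

Callers that can tolerate allocation failure would instead drop __GFP_NOFAIL and handle NULL, which appears to be how the old KM_MAYFAIL cases map onto plain GFP flags.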