diff options
Diffstat (limited to 'fs/xfs/xfs_icache.c')
-rw-r--r-- | fs/xfs/xfs_icache.c | 181 |
1 files changed, 118 insertions, 63 deletions
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 57a9f2317525..6b119a7a324f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -24,6 +24,7 @@ #include "xfs_ialloc.h" #include "xfs_ag.h" #include "xfs_log_priv.h" +#include "xfs_health.h" #include <linux/iversion.h> @@ -64,6 +65,18 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, XFS_ICWALK_FLAG_RECLAIM_SICK | \ XFS_ICWALK_FLAG_UNION) +/* Marks for the perag xarray */ +#define XFS_PERAG_RECLAIM_MARK XA_MARK_0 +#define XFS_PERAG_BLOCKGC_MARK XA_MARK_1 + +static inline xa_mark_t ici_tag_to_mark(unsigned int tag) +{ + if (tag == XFS_ICI_RECLAIM_TAG) + return XFS_PERAG_RECLAIM_MARK; + ASSERT(tag == XFS_ICI_BLOCKGC_TAG); + return XFS_PERAG_BLOCKGC_MARK; +} + /* * Allocate and initialise an xfs_inode. */ @@ -85,10 +98,10 @@ xfs_inode_alloc( return NULL; } - /* VFS doesn't initialise i_mode or i_state! */ + /* VFS doesn't initialise i_mode! */ VFS_I(ip)->i_mode = 0; - VFS_I(ip)->i_state = 0; - mapping_set_large_folios(VFS_I(ip)->i_mapping); + mapping_set_folio_min_order(VFS_I(ip)->i_mapping, + M_IGEO(mp)->min_folio_order); XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); @@ -191,7 +204,7 @@ xfs_reclaim_work_queue( { rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + if (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) { queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); } @@ -241,9 +254,7 @@ xfs_perag_set_inode_tag( return; /* propagate the tag up into the perag radix tree */ - spin_lock(&mp->m_perag_lock); - radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag); - spin_unlock(&mp->m_perag_lock); + xa_set_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag)); /* start background work */ switch (tag) { @@ -285,14 +296,39 @@ xfs_perag_clear_inode_tag( return; /* clear the tag from the perag radix tree */ - spin_lock(&mp->m_perag_lock); - radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag); - spin_unlock(&mp->m_perag_lock); + xa_clear_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag)); trace_xfs_perag_clear_inode_tag(pag, _RET_IP_); } /* + * Find the next AG after @pag, or the first AG if @pag is NULL. + */ +static struct xfs_perag * +xfs_perag_grab_next_tag( + struct xfs_mount *mp, + struct xfs_perag *pag, + int tag) +{ + unsigned long index = 0; + + if (pag) { + index = pag->pag_agno + 1; + xfs_perag_rele(pag); + } + + rcu_read_lock(); + pag = xa_find(&mp->m_perags, &index, ULONG_MAX, ici_tag_to_mark(tag)); + if (pag) { + trace_xfs_perag_grab_next_tag(pag, _RET_IP_); + if (!atomic_inc_not_zero(&pag->pag_active_ref)) + pag = NULL; + } + rcu_read_unlock(); + return pag; +} + +/* * When we recycle a reclaimable inode, we need to re-initialise the VFS inode * part of the structure. This is made more complex by the fact we store * information about the on-disk values in the VFS inode and so we can't just @@ -313,6 +349,7 @@ xfs_reinit_inode( dev_t dev = inode->i_rdev; kuid_t uid = inode->i_uid; kgid_t gid = inode->i_gid; + unsigned long state = inode->i_state; error = inode_init_always(mp->m_super, inode); @@ -323,7 +360,9 @@ xfs_reinit_inode( inode->i_rdev = dev; inode->i_uid = uid; inode->i_gid = gid; - mapping_set_large_folios(inode->i_mapping); + inode->i_state = state; + mapping_set_folio_min_order(inode->i_mapping, + M_IGEO(mp)->min_folio_order); return error; } @@ -415,6 +454,9 @@ xfs_iget_check_free_state( xfs_warn(ip->i_mount, "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)", ip->i_ino, VFS_I(ip)->i_mode); + xfs_agno_mark_sick(ip->i_mount, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_SICK_AG_INOBT); return -EFSCORRUPTED; } @@ -422,6 +464,9 @@ xfs_iget_check_free_state( xfs_warn(ip->i_mount, "Corruption detected! Free inode 0x%llx has blocks allocated!", ip->i_ino); + xfs_agno_mark_sick(ip->i_mount, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_SICK_AG_INOBT); return -EFSCORRUPTED; } return 0; @@ -606,7 +651,6 @@ xfs_iget_cache_miss( struct xfs_inode *ip; int error; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); - int iflags; ip = xfs_inode_alloc(mp, ino); if (!ip) @@ -640,6 +684,8 @@ xfs_iget_cache_miss( xfs_buf_offset(bp, ip->i_imap.im_boffset)); if (!error) xfs_buf_set_ref(bp, XFS_INO_REF); + else + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); xfs_trans_brelse(tp, bp); if (error) @@ -659,10 +705,9 @@ xfs_iget_cache_miss( /* * Preload the radix tree so we can insert safely under the * write spinlock. Note that we cannot sleep inside the preload - * region. Since we can be called from transaction context, don't - * recurse into the file system. + * region. */ - if (radix_tree_preload(GFP_NOFS)) { + if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) { error = -EAGAIN; goto out_destroy; } @@ -685,13 +730,12 @@ xfs_iget_cache_miss( * memory barrier that ensures this detection works correctly at lookup * time. */ - iflags = XFS_INEW; if (flags & XFS_IGET_DONTCACHE) d_mark_dontcache(VFS_I(ip)); ip->i_udquot = NULL; ip->i_gdquot = NULL; ip->i_pdquot = NULL; - xfs_iflags_set(ip, iflags); + xfs_iflags_set(ip, XFS_INEW); /* insert the new inode */ spin_lock(&pag->pag_ici_lock); @@ -748,7 +792,7 @@ xfs_iget( ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); /* reject inode numbers outside existing AGs */ - if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + if (!xfs_verify_ino(mp, ino)) return -EINVAL; XFS_STATS_INC(mp, xs_ig_attempts); @@ -970,7 +1014,7 @@ xfs_reclaim_inodes( if (xfs_want_reclaim_sick(mp)) icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; - while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + while (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) { xfs_ail_push_all_sync(mp->m_ail); xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); } @@ -1012,15 +1056,17 @@ long xfs_reclaim_inodes_count( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t ag = 0; + XA_STATE (xas, &mp->m_perags, 0); long reclaimable = 0; + struct xfs_perag *pag; - while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { - ag = pag->pag_agno + 1; + rcu_read_lock(); + xas_for_each_marked(&xas, pag, ULONG_MAX, XFS_PERAG_RECLAIM_MARK) { + trace_xfs_reclaim_inodes_count(pag, _THIS_IP_); reclaimable += pag->pag_ici_reclaimable; - xfs_perag_put(pag); } + rcu_read_unlock(); + return reclaimable; } @@ -1152,7 +1198,7 @@ xfs_inode_free_eofblocks( if (xfs_can_free_eofblocks(ip)) return xfs_free_eofblocks(ip); - /* inode could be preallocated or append-only */ + /* inode could be preallocated */ trace_xfs_inode_free_eofblocks_invalid(ip); xfs_inode_clear_eofblocks_tag(ip); return 0; @@ -1234,14 +1280,17 @@ xfs_inode_clear_eofblocks_tag( } /* - * Set ourselves up to free CoW blocks from this file. If it's already clean - * then we can bail out quickly, but otherwise we must back off if the file - * is undergoing some kind of write. + * Prepare to free COW fork blocks from an inode. */ static bool xfs_prep_free_cowblocks( - struct xfs_inode *ip) + struct xfs_inode *ip, + struct xfs_icwalk *icw) { + bool sync; + + sync = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC); + /* * Just clear the tag if we have an empty cow fork or none at all. It's * possible the inode was fully unshared since it was originally tagged. @@ -1253,16 +1302,22 @@ xfs_prep_free_cowblocks( } /* - * If the mapping is dirty or under writeback we cannot touch the - * CoW fork. Leave it alone if we're in the midst of a directio. + * A cowblocks trim of an inode can have a significant effect on + * fragmentation even when a reasonable COW extent size hint is set. + * Therefore, we prefer to not process cowblocks unless they are clean + * and idle. We can never process a cowblocks inode that is dirty or has + * in-flight I/O under any circumstances, because outstanding writeback + * or dio expects targeted COW fork blocks exist through write + * completion where they can be remapped into the data fork. + * + * Therefore, the heuristic used here is to never process inodes + * currently opened for write from background (i.e. non-sync) scans. For + * sync scans, use the pagecache/dio state of the inode to ensure we + * never free COW fork blocks out from under pending I/O. */ - if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || - mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || - mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || - atomic_read(&VFS_I(ip)->i_dio_count)) + if (!sync && inode_is_open_for_write(VFS_I(ip))) return false; - - return true; + return xfs_can_free_cowblocks(ip); } /* @@ -1291,7 +1346,7 @@ xfs_inode_free_cowblocks( if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) return 0; - if (!xfs_prep_free_cowblocks(ip)) + if (!xfs_prep_free_cowblocks(ip, icw)) return 0; if (!xfs_icwalk_match(ip, icw)) @@ -1320,7 +1375,7 @@ xfs_inode_free_cowblocks( * Check again, nobody else should be able to dirty blocks or change * the reflink iflag now that we have the first two locks held. */ - if (xfs_prep_free_cowblocks(ip)) + if (xfs_prep_free_cowblocks(ip, icw)) ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); return ret; } @@ -1362,14 +1417,13 @@ void xfs_blockgc_start( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; if (xfs_set_blockgc_enabled(mp)) return; trace_xfs_blockgc_start(mp, __return_address); - for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG))) xfs_blockgc_queue(pag); } @@ -1485,21 +1539,19 @@ int xfs_blockgc_flush_all( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; trace_xfs_blockgc_flush_all(mp, __return_address); /* - * For each blockgc worker, move its queue time up to now. If it - * wasn't queued, it will not be requeued. Then flush whatever's - * left. + * For each blockgc worker, move its queue time up to now. If it wasn't + * queued, it will not be requeued. Then flush whatever is left. */ - for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG))) mod_delayed_work(pag->pag_mount->m_blockgc_wq, &pag->pag_blockgc_work, 0); - for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG))) flush_delayed_work(&pag->pag_blockgc_work); return xfs_inodegc_flush(mp); @@ -1745,12 +1797,11 @@ xfs_icwalk( enum xfs_icwalk_goal goal, struct xfs_icwalk *icw) { - struct xfs_perag *pag; + struct xfs_perag *pag = NULL; int error = 0; int last_error = 0; - xfs_agnumber_t agno; - for_each_perag_tag(mp, agno, pag, goal) { + while ((pag = xfs_perag_grab_next_tag(mp, pag, goal))) { error = xfs_icwalk_ag(pag, goal, icw); if (error) { last_error = error; @@ -2167,8 +2218,7 @@ xfs_inodegc_shrinker_count( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_mount *mp = container_of(shrink, struct xfs_mount, - m_inodegc_shrinker); + struct xfs_mount *mp = shrink->private_data; struct xfs_inodegc *gc; int cpu; @@ -2189,8 +2239,7 @@ xfs_inodegc_shrinker_scan( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_mount *mp = container_of(shrink, struct xfs_mount, - m_inodegc_shrinker); + struct xfs_mount *mp = shrink->private_data; struct xfs_inodegc *gc; int cpu; bool no_items = true; @@ -2226,13 +2275,19 @@ int xfs_inodegc_register_shrinker( struct xfs_mount *mp) { - struct shrinker *shrink = &mp->m_inodegc_shrinker; + mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB, + "xfs-inodegc:%s", + mp->m_super->s_id); + if (!mp->m_inodegc_shrinker) + return -ENOMEM; - shrink->count_objects = xfs_inodegc_shrinker_count; - shrink->scan_objects = xfs_inodegc_shrinker_scan; - shrink->seeks = 0; - shrink->flags = SHRINKER_NONSLAB; - shrink->batch = XFS_INODEGC_SHRINKER_BATCH; + mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count; + mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan; + mp->m_inodegc_shrinker->seeks = 0; + mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH; + mp->m_inodegc_shrinker->private_data = mp; - return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id); + shrinker_register(mp->m_inodegc_shrinker); + + return 0; } |