summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_icache.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_icache.c')
-rw-r--r--fs/xfs/xfs_icache.c181
1 files changed, 118 insertions, 63 deletions
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 57a9f2317525..6b119a7a324f 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -24,6 +24,7 @@
#include "xfs_ialloc.h"
#include "xfs_ag.h"
#include "xfs_log_priv.h"
+#include "xfs_health.h"
#include <linux/iversion.h>
@@ -64,6 +65,18 @@ static int xfs_icwalk_ag(struct xfs_perag *pag,
XFS_ICWALK_FLAG_RECLAIM_SICK | \
XFS_ICWALK_FLAG_UNION)
+/* Marks for the perag xarray */
+#define XFS_PERAG_RECLAIM_MARK XA_MARK_0
+#define XFS_PERAG_BLOCKGC_MARK XA_MARK_1
+
+static inline xa_mark_t ici_tag_to_mark(unsigned int tag)
+{
+ if (tag == XFS_ICI_RECLAIM_TAG)
+ return XFS_PERAG_RECLAIM_MARK;
+ ASSERT(tag == XFS_ICI_BLOCKGC_TAG);
+ return XFS_PERAG_BLOCKGC_MARK;
+}
+
/*
* Allocate and initialise an xfs_inode.
*/
@@ -85,10 +98,10 @@ xfs_inode_alloc(
return NULL;
}
- /* VFS doesn't initialise i_mode or i_state! */
+ /* VFS doesn't initialise i_mode! */
VFS_I(ip)->i_mode = 0;
- VFS_I(ip)->i_state = 0;
- mapping_set_large_folios(VFS_I(ip)->i_mapping);
+ mapping_set_folio_min_order(VFS_I(ip)->i_mapping,
+ M_IGEO(mp)->min_folio_order);
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
@@ -191,7 +204,7 @@ xfs_reclaim_work_queue(
{
rcu_read_lock();
- if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+ if (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) {
queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
}
@@ -241,9 +254,7 @@ xfs_perag_set_inode_tag(
return;
/* propagate the tag up into the perag radix tree */
- spin_lock(&mp->m_perag_lock);
- radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
- spin_unlock(&mp->m_perag_lock);
+ xa_set_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag));
/* start background work */
switch (tag) {
@@ -285,14 +296,39 @@ xfs_perag_clear_inode_tag(
return;
/* clear the tag from the perag radix tree */
- spin_lock(&mp->m_perag_lock);
- radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
- spin_unlock(&mp->m_perag_lock);
+ xa_clear_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag));
trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
}
/*
+ * Find the next AG after @pag, or the first AG if @pag is NULL.
+ */
+static struct xfs_perag *
+xfs_perag_grab_next_tag(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ int tag)
+{
+ unsigned long index = 0;
+
+ if (pag) {
+ index = pag->pag_agno + 1;
+ xfs_perag_rele(pag);
+ }
+
+ rcu_read_lock();
+ pag = xa_find(&mp->m_perags, &index, ULONG_MAX, ici_tag_to_mark(tag));
+ if (pag) {
+ trace_xfs_perag_grab_next_tag(pag, _RET_IP_);
+ if (!atomic_inc_not_zero(&pag->pag_active_ref))
+ pag = NULL;
+ }
+ rcu_read_unlock();
+ return pag;
+}
+
+/*
* When we recycle a reclaimable inode, we need to re-initialise the VFS inode
* part of the structure. This is made more complex by the fact we store
* information about the on-disk values in the VFS inode and so we can't just
@@ -313,6 +349,7 @@ xfs_reinit_inode(
dev_t dev = inode->i_rdev;
kuid_t uid = inode->i_uid;
kgid_t gid = inode->i_gid;
+ unsigned long state = inode->i_state;
error = inode_init_always(mp->m_super, inode);
@@ -323,7 +360,9 @@ xfs_reinit_inode(
inode->i_rdev = dev;
inode->i_uid = uid;
inode->i_gid = gid;
- mapping_set_large_folios(inode->i_mapping);
+ inode->i_state = state;
+ mapping_set_folio_min_order(inode->i_mapping,
+ M_IGEO(mp)->min_folio_order);
return error;
}
@@ -415,6 +454,9 @@ xfs_iget_check_free_state(
xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
ip->i_ino, VFS_I(ip)->i_mode);
+ xfs_agno_mark_sick(ip->i_mount,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_SICK_AG_INOBT);
return -EFSCORRUPTED;
}
@@ -422,6 +464,9 @@ xfs_iget_check_free_state(
xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
ip->i_ino);
+ xfs_agno_mark_sick(ip->i_mount,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_SICK_AG_INOBT);
return -EFSCORRUPTED;
}
return 0;
@@ -606,7 +651,6 @@ xfs_iget_cache_miss(
struct xfs_inode *ip;
int error;
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
- int iflags;
ip = xfs_inode_alloc(mp, ino);
if (!ip)
@@ -640,6 +684,8 @@ xfs_iget_cache_miss(
xfs_buf_offset(bp, ip->i_imap.im_boffset));
if (!error)
xfs_buf_set_ref(bp, XFS_INO_REF);
+ else
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
xfs_trans_brelse(tp, bp);
if (error)
@@ -659,10 +705,9 @@ xfs_iget_cache_miss(
/*
* Preload the radix tree so we can insert safely under the
* write spinlock. Note that we cannot sleep inside the preload
- * region. Since we can be called from transaction context, don't
- * recurse into the file system.
+ * region.
*/
- if (radix_tree_preload(GFP_NOFS)) {
+ if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) {
error = -EAGAIN;
goto out_destroy;
}
@@ -685,13 +730,12 @@ xfs_iget_cache_miss(
* memory barrier that ensures this detection works correctly at lookup
* time.
*/
- iflags = XFS_INEW;
if (flags & XFS_IGET_DONTCACHE)
d_mark_dontcache(VFS_I(ip));
ip->i_udquot = NULL;
ip->i_gdquot = NULL;
ip->i_pdquot = NULL;
- xfs_iflags_set(ip, iflags);
+ xfs_iflags_set(ip, XFS_INEW);
/* insert the new inode */
spin_lock(&pag->pag_ici_lock);
@@ -748,7 +792,7 @@ xfs_iget(
ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
/* reject inode numbers outside existing AGs */
- if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+ if (!xfs_verify_ino(mp, ino))
return -EINVAL;
XFS_STATS_INC(mp, xs_ig_attempts);
@@ -970,7 +1014,7 @@ xfs_reclaim_inodes(
if (xfs_want_reclaim_sick(mp))
icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
- while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+ while (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) {
xfs_ail_push_all_sync(mp->m_ail);
xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
}
@@ -1012,15 +1056,17 @@ long
xfs_reclaim_inodes_count(
struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- xfs_agnumber_t ag = 0;
+ XA_STATE (xas, &mp->m_perags, 0);
long reclaimable = 0;
+ struct xfs_perag *pag;
- while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
- ag = pag->pag_agno + 1;
+ rcu_read_lock();
+ xas_for_each_marked(&xas, pag, ULONG_MAX, XFS_PERAG_RECLAIM_MARK) {
+ trace_xfs_reclaim_inodes_count(pag, _THIS_IP_);
reclaimable += pag->pag_ici_reclaimable;
- xfs_perag_put(pag);
}
+ rcu_read_unlock();
+
return reclaimable;
}
@@ -1152,7 +1198,7 @@ xfs_inode_free_eofblocks(
if (xfs_can_free_eofblocks(ip))
return xfs_free_eofblocks(ip);
- /* inode could be preallocated or append-only */
+ /* inode could be preallocated */
trace_xfs_inode_free_eofblocks_invalid(ip);
xfs_inode_clear_eofblocks_tag(ip);
return 0;
@@ -1234,14 +1280,17 @@ xfs_inode_clear_eofblocks_tag(
}
/*
- * Set ourselves up to free CoW blocks from this file. If it's already clean
- * then we can bail out quickly, but otherwise we must back off if the file
- * is undergoing some kind of write.
+ * Prepare to free COW fork blocks from an inode.
*/
static bool
xfs_prep_free_cowblocks(
- struct xfs_inode *ip)
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw)
{
+ bool sync;
+
+ sync = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
+
/*
* Just clear the tag if we have an empty cow fork or none at all. It's
* possible the inode was fully unshared since it was originally tagged.
@@ -1253,16 +1302,22 @@ xfs_prep_free_cowblocks(
}
/*
- * If the mapping is dirty or under writeback we cannot touch the
- * CoW fork. Leave it alone if we're in the midst of a directio.
+ * A cowblocks trim of an inode can have a significant effect on
+ * fragmentation even when a reasonable COW extent size hint is set.
+ * Therefore, we prefer to not process cowblocks unless they are clean
+ * and idle. We can never process a cowblocks inode that is dirty or has
+ * in-flight I/O under any circumstances, because outstanding writeback
+ * or dio expects targeted COW fork blocks exist through write
+ * completion where they can be remapped into the data fork.
+ *
+ * Therefore, the heuristic used here is to never process inodes
+ * currently opened for write from background (i.e. non-sync) scans. For
+ * sync scans, use the pagecache/dio state of the inode to ensure we
+ * never free COW fork blocks out from under pending I/O.
*/
- if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
- mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
- mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
- atomic_read(&VFS_I(ip)->i_dio_count))
+ if (!sync && inode_is_open_for_write(VFS_I(ip)))
return false;
-
- return true;
+ return xfs_can_free_cowblocks(ip);
}
/*
@@ -1291,7 +1346,7 @@ xfs_inode_free_cowblocks(
if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
return 0;
- if (!xfs_prep_free_cowblocks(ip))
+ if (!xfs_prep_free_cowblocks(ip, icw))
return 0;
if (!xfs_icwalk_match(ip, icw))
@@ -1320,7 +1375,7 @@ xfs_inode_free_cowblocks(
* Check again, nobody else should be able to dirty blocks or change
* the reflink iflag now that we have the first two locks held.
*/
- if (xfs_prep_free_cowblocks(ip))
+ if (xfs_prep_free_cowblocks(ip, icw))
ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
return ret;
}
@@ -1362,14 +1417,13 @@ void
xfs_blockgc_start(
struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
if (xfs_set_blockgc_enabled(mp))
return;
trace_xfs_blockgc_start(mp, __return_address);
- for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
xfs_blockgc_queue(pag);
}
@@ -1485,21 +1539,19 @@ int
xfs_blockgc_flush_all(
struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
trace_xfs_blockgc_flush_all(mp, __return_address);
/*
- * For each blockgc worker, move its queue time up to now. If it
- * wasn't queued, it will not be requeued. Then flush whatever's
- * left.
+ * For each blockgc worker, move its queue time up to now. If it wasn't
+ * queued, it will not be requeued. Then flush whatever is left.
*/
- for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
mod_delayed_work(pag->pag_mount->m_blockgc_wq,
&pag->pag_blockgc_work, 0);
- for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
flush_delayed_work(&pag->pag_blockgc_work);
return xfs_inodegc_flush(mp);
@@ -1745,12 +1797,11 @@ xfs_icwalk(
enum xfs_icwalk_goal goal,
struct xfs_icwalk *icw)
{
- struct xfs_perag *pag;
+ struct xfs_perag *pag = NULL;
int error = 0;
int last_error = 0;
- xfs_agnumber_t agno;
- for_each_perag_tag(mp, agno, pag, goal) {
+ while ((pag = xfs_perag_grab_next_tag(mp, pag, goal))) {
error = xfs_icwalk_ag(pag, goal, icw);
if (error) {
last_error = error;
@@ -2167,8 +2218,7 @@ xfs_inodegc_shrinker_count(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
- m_inodegc_shrinker);
+ struct xfs_mount *mp = shrink->private_data;
struct xfs_inodegc *gc;
int cpu;
@@ -2189,8 +2239,7 @@ xfs_inodegc_shrinker_scan(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
- m_inodegc_shrinker);
+ struct xfs_mount *mp = shrink->private_data;
struct xfs_inodegc *gc;
int cpu;
bool no_items = true;
@@ -2226,13 +2275,19 @@ int
xfs_inodegc_register_shrinker(
struct xfs_mount *mp)
{
- struct shrinker *shrink = &mp->m_inodegc_shrinker;
+ mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB,
+ "xfs-inodegc:%s",
+ mp->m_super->s_id);
+ if (!mp->m_inodegc_shrinker)
+ return -ENOMEM;
- shrink->count_objects = xfs_inodegc_shrinker_count;
- shrink->scan_objects = xfs_inodegc_shrinker_scan;
- shrink->seeks = 0;
- shrink->flags = SHRINKER_NONSLAB;
- shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
+ mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count;
+ mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan;
+ mp->m_inodegc_shrinker->seeks = 0;
+ mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH;
+ mp->m_inodegc_shrinker->private_data = mp;
- return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id);
+ shrinker_register(mp->m_inodegc_shrinker);
+
+ return 0;
}