summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_icache.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_icache.c')
-rw-r--r--fs/xfs/xfs_icache.c735
1 files changed, 296 insertions, 439 deletions
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 8dc2e5414276..101028ebb571 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -22,6 +22,7 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
+#include "xfs_ialloc.h"
#include <linux/iversion.h>
@@ -36,13 +37,11 @@ xfs_inode_alloc(
struct xfs_inode *ip;
/*
- * if this didn't occur in transactions, we could use
- * KM_MAYFAIL and return NULL here on ENOMEM. Set the
- * code up to do this anyway.
+ * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
+ * and return NULL here on ENOMEM.
*/
- ip = kmem_zone_alloc(xfs_inode_zone, 0);
- if (!ip)
- return NULL;
+ ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);
+
if (inode_init_always(mp->m_super, VFS_I(ip))) {
kmem_cache_free(xfs_inode_zone, ip);
return NULL;
@@ -62,8 +61,6 @@ xfs_inode_alloc(
memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
ip->i_afp = NULL;
ip->i_cowfp = NULL;
- ip->i_cnextents = 0;
- ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
memset(&ip->i_df, 0, sizeof(ip->i_df));
ip->i_flags = 0;
ip->i_delayed_blks = 0;
@@ -88,15 +85,18 @@ xfs_inode_free_callback(
case S_IFREG:
case S_IFDIR:
case S_IFLNK:
- xfs_idestroy_fork(ip, XFS_DATA_FORK);
+ xfs_idestroy_fork(&ip->i_df);
break;
}
- if (ip->i_afp)
- xfs_idestroy_fork(ip, XFS_ATTR_FORK);
- if (ip->i_cowfp)
- xfs_idestroy_fork(ip, XFS_COW_FORK);
-
+ if (ip->i_afp) {
+ xfs_idestroy_fork(ip->i_afp);
+ kmem_cache_free(xfs_ifork_zone, ip->i_afp);
+ }
+ if (ip->i_cowfp) {
+ xfs_idestroy_fork(ip->i_cowfp);
+ kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
+ }
if (ip->i_itemp) {
ASSERT(!test_bit(XFS_LI_IN_AIL,
&ip->i_itemp->ili_item.li_flags));
@@ -113,6 +113,7 @@ __xfs_inode_free(
{
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
XFS_STATS_DEC(ip->i_mount, vn_active);
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
@@ -139,11 +140,8 @@ xfs_inode_free(
}
/*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs periodic sync default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
+ * Queue background inode reclaim work if there are reclaimable inodes and there
+ * isn't reclaim work already scheduled or in progress.
*/
static void
xfs_reclaim_work_queue(
@@ -158,24 +156,6 @@ xfs_reclaim_work_queue(
rcu_read_unlock();
}
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-void
-xfs_reclaim_worker(
- struct work_struct *work)
-{
- struct xfs_mount *mp = container_of(to_delayed_work(work),
- struct xfs_mount, m_reclaim_work);
-
- xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
- xfs_reclaim_work_queue(mp);
-}
-
static void
xfs_perag_set_reclaim_tag(
struct xfs_perag *pag)
@@ -289,6 +269,8 @@ xfs_reinit_inode(
uint64_t version = inode_peek_iversion(inode);
umode_t mode = inode->i_mode;
dev_t dev = inode->i_rdev;
+ kuid_t uid = inode->i_uid;
+ kgid_t gid = inode->i_gid;
error = inode_init_always(mp->m_super, inode);
@@ -297,6 +279,8 @@ xfs_reinit_inode(
inode_set_iversion_queried(inode, version);
inode->i_mode = mode;
inode->i_rdev = dev;
+ inode->i_uid = uid;
+ inode->i_gid = gid;
return error;
}
@@ -419,6 +403,7 @@ xfs_iget_cache_hit(
spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
+ ASSERT(!rwsem_is_locked(&inode->i_rwsem));
error = xfs_reinit_inode(mp, inode);
if (error) {
bool wake;
@@ -452,9 +437,6 @@ xfs_iget_cache_hit(
ip->i_sick = 0;
ip->i_checked = 0;
- ASSERT(!rwsem_is_locked(&inode->i_rwsem));
- init_rwsem(&inode->i_rwsem);
-
spin_unlock(&ip->i_flags_lock);
spin_unlock(&pag->pag_ici_lock);
} else {
@@ -475,7 +457,7 @@ xfs_iget_cache_hit(
xfs_ilock(ip, lock_flags);
if (!(flags & XFS_IGET_INCORE))
- xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
+ xfs_iflags_clear(ip, XFS_ISTALE);
XFS_STATS_INC(mp, xs_ig_found);
return 0;
@@ -506,18 +488,42 @@ xfs_iget_cache_miss(
if (!ip)
return -ENOMEM;
- error = xfs_iread(mp, tp, ip, flags);
+ error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
if (error)
goto out_destroy;
- if (!xfs_inode_verify_forks(ip)) {
- error = -EFSCORRUPTED;
- goto out_destroy;
+ /*
+ * For version 5 superblocks, if we are initialising a new inode and we
+ * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can
+ * simply build the new inode core with a random generation number.
+ *
+ * For version 4 (and older) superblocks, log recovery is dependent on
+ * the di_flushiter field being initialised from the current on-disk
+ * value and hence we must also read the inode off disk even when
+ * initializing new inodes.
+ */
+ if (xfs_sb_version_has_v3inode(&mp->m_sb) &&
+ (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ VFS_I(ip)->i_generation = prandom_u32();
+ } else {
+ struct xfs_dinode *dip;
+ struct xfs_buf *bp;
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0);
+ if (error)
+ goto out_destroy;
+
+ error = xfs_inode_from_disk(ip, dip);
+ if (!error)
+ xfs_buf_set_ref(bp, XFS_INO_REF);
+ xfs_trans_brelse(tp, bp);
+
+ if (error)
+ goto out_destroy;
}
trace_xfs_iget_miss(ip);
-
/*
* Check the inode free state is valid. This also detects lookup
* racing with unlinks.
@@ -557,7 +563,7 @@ xfs_iget_cache_miss(
*/
iflags = XFS_INEW;
if (flags & XFS_IGET_DONTCACHE)
- iflags |= XFS_IDONTCACHE;
+ d_mark_dontcache(VFS_I(ip));
ip->i_udquot = NULL;
ip->i_gdquot = NULL;
ip->i_pdquot = NULL;
@@ -590,48 +596,31 @@ out_destroy:
}
/*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, initialise the vfs inode
- * if necessary.
+ * Look up an inode by number in the given file system. The inode is looked up
+ * in the cache held in each AG. If the inode is found in the cache, initialise
+ * the vfs inode if necessary.
*
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and initialise the vfs inode.
+ * If it is not in core, read it in from the file system's device, add it to the
+ * cache and initialise the vfs inode.
*
* The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system. It points
- * to the inode hash table.
- * tp -- a pointer to the current transaction if there is one. This is
- * simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired. This is the unique identifier
- * within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode. See the comment
- * for xfs_ilock() for a list of valid values.
+ * Inode lookup is only done during metadata operations and not as part of the
+ * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
*/
int
xfs_iget(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- uint flags,
- uint lock_flags,
- xfs_inode_t **ipp)
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ uint flags,
+ uint lock_flags,
+ struct xfs_inode **ipp)
{
- xfs_inode_t *ip;
- int error;
- xfs_perag_t *pag;
- xfs_agino_t agino;
+ struct xfs_inode *ip;
+ struct xfs_perag *pag;
+ xfs_agino_t agino;
+ int error;
- /*
- * xfs_reclaim_inode() uses the ILOCK to ensure an inode
- * doesn't get freed while it's being referenced during a
- * radix tree traversal here. It assumes this function
- * aqcuires only the ILOCK (and therefore it has no need to
- * involve the IOLOCK in this synchronization).
- */
ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
/* reject inode numbers outside existing AGs */
@@ -733,25 +722,22 @@ xfs_icache_inode_is_allocated(
*/
#define XFS_LOOKUP_BATCH 32
-STATIC int
-xfs_inode_ag_walk_grab(
+/*
+ * Decide if the given @ip is eligible to be a part of the inode walk, and
+ * grab it if so. Returns true if it's ready to go or false if we should just
+ * ignore it.
+ */
+STATIC bool
+xfs_inode_walk_ag_grab(
struct xfs_inode *ip,
int flags)
{
struct inode *inode = VFS_I(ip);
- bool newinos = !!(flags & XFS_AGITER_INEW_WAIT);
+ bool newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT);
ASSERT(rcu_read_lock_held());
- /*
- * check for stale RCU freed inode
- *
- * If the inode has been reallocated, it doesn't matter if it's not in
- * the AG we are walking - we are walking for writeback, so if it
- * passes all the "valid inode" checks and is dirty, then we'll write
- * it back anyway. If it has been reallocated and still being
- * initialised, the XFS_INEW check below will catch it.
- */
+ /* Check for stale RCU freed inode */
spin_lock(&ip->i_flags_lock);
if (!ip->i_ino)
goto out_unlock_noent;
@@ -764,39 +750,41 @@ xfs_inode_ag_walk_grab(
/* nothing to sync during shutdown */
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EFSCORRUPTED;
+ return false;
/* If we can't grab the inode, it must on it's way to reclaim. */
if (!igrab(inode))
- return -ENOENT;
+ return false;
/* inode is valid */
- return 0;
+ return true;
out_unlock_noent:
spin_unlock(&ip->i_flags_lock);
- return -ENOENT;
+ return false;
}
+/*
+ * For a given per-AG structure @pag, grab, @execute, and rele all incore
+ * inodes with the given radix tree @tag.
+ */
STATIC int
-xfs_inode_ag_walk(
- struct xfs_mount *mp,
+xfs_inode_walk_ag(
struct xfs_perag *pag,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int flags,
+ int iter_flags,
+ int (*execute)(struct xfs_inode *ip, void *args),
void *args,
- int tag,
- int iter_flags)
+ int tag)
{
+ struct xfs_mount *mp = pag->pag_mount;
uint32_t first_index;
int last_error = 0;
int skipped;
- int done;
+ bool done;
int nr_found;
restart:
- done = 0;
+ done = false;
skipped = 0;
first_index = 0;
nr_found = 0;
@@ -807,7 +795,7 @@ restart:
rcu_read_lock();
- if (tag == -1)
+ if (tag == XFS_ICI_NO_TAG)
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)batch, first_index,
XFS_LOOKUP_BATCH);
@@ -829,7 +817,7 @@ restart:
for (i = 0; i < nr_found; i++) {
struct xfs_inode *ip = batch[i];
- if (done || xfs_inode_ag_walk_grab(ip, iter_flags))
+ if (done || !xfs_inode_walk_ag_grab(ip, iter_flags))
batch[i] = NULL;
/*
@@ -848,7 +836,7 @@ restart:
continue;
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
- done = 1;
+ done = true;
}
/* unlock now we've grabbed the inodes. */
@@ -857,10 +845,10 @@ restart:
for (i = 0; i < nr_found; i++) {
if (!batch[i])
continue;
- if ((iter_flags & XFS_AGITER_INEW_WAIT) &&
+ if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) &&
xfs_iflags_test(batch[i], XFS_INEW))
xfs_inew_wait(batch[i]);
- error = execute(batch[i], flags, args);
+ error = execute(batch[i], args);
xfs_irele(batch[i]);
if (error == -EAGAIN) {
skipped++;
@@ -885,6 +873,49 @@ restart:
return last_error;
}
+/* Fetch the next (possibly tagged) per-AG structure. */
+static inline struct xfs_perag *
+xfs_inode_walk_get_perag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ int tag)
+{
+ if (tag == XFS_ICI_NO_TAG)
+ return xfs_perag_get(mp, agno);
+ return xfs_perag_get_tag(mp, agno, tag);
+}
+
+/*
+ * Call the @execute function on all incore inodes matching the radix tree
+ * @tag.
+ */
+int
+xfs_inode_walk(
+ struct xfs_mount *mp,
+ int iter_flags,
+ int (*execute)(struct xfs_inode *ip, void *args),
+ void *args,
+ int tag)
+{
+ struct xfs_perag *pag;
+ int error = 0;
+ int last_error = 0;
+ xfs_agnumber_t ag;
+
+ ag = 0;
+ while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) {
+ ag = pag->pag_agno + 1;
+ error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag);
+ xfs_perag_put(pag);
+ if (error) {
+ last_error = error;
+ if (error == -EFSCORRUPTED)
+ break;
+ }
+ }
+ return last_error;
+}
+
/*
* Background scanning to trim post-EOF preallocated space. This is queued
* based on the 'speculative_prealloc_lifetime' tunable (5m by default).
@@ -907,7 +938,12 @@ xfs_eofblocks_worker(
{
struct xfs_mount *mp = container_of(to_delayed_work(work),
struct xfs_mount, m_eofblocks_work);
+
+ if (!sb_start_write_trylock(mp->m_super))
+ return;
xfs_icache_free_eofblocks(mp, NULL);
+ sb_end_write(mp->m_super);
+
xfs_queue_eofblocks(mp);
}
@@ -934,235 +970,86 @@ xfs_cowblocks_worker(
{
struct xfs_mount *mp = container_of(to_delayed_work(work),
struct xfs_mount, m_cowblocks_work);
- xfs_icache_free_cowblocks(mp, NULL);
- xfs_queue_cowblocks(mp);
-}
-
-int
-xfs_inode_ag_iterator_flags(
- struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int flags,
- void *args,
- int iter_flags)
-{
- struct xfs_perag *pag;
- int error = 0;
- int last_error = 0;
- xfs_agnumber_t ag;
-
- ag = 0;
- while ((pag = xfs_perag_get(mp, ag))) {
- ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1,
- iter_flags);
- xfs_perag_put(pag);
- if (error) {
- last_error = error;
- if (error == -EFSCORRUPTED)
- break;
- }
- }
- return last_error;
-}
-int
-xfs_inode_ag_iterator(
- struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int flags,
- void *args)
-{
- return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0);
-}
-
-int
-xfs_inode_ag_iterator_tag(
- struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int flags,
- void *args,
- int tag)
-{
- struct xfs_perag *pag;
- int error = 0;
- int last_error = 0;
- xfs_agnumber_t ag;
+ if (!sb_start_write_trylock(mp->m_super))
+ return;
+ xfs_icache_free_cowblocks(mp, NULL);
+ sb_end_write(mp->m_super);
- ag = 0;
- while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
- ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag,
- 0);
- xfs_perag_put(pag);
- if (error) {
- last_error = error;
- if (error == -EFSCORRUPTED)
- break;
- }
- }
- return last_error;
+ xfs_queue_cowblocks(mp);
}
/*
* Grab the inode for reclaim exclusively.
- * Return 0 if we grabbed it, non-zero otherwise.
+ *
+ * We have found this inode via a lookup under RCU, so the inode may have
+ * already been freed, or it may be in the process of being recycled by
+ * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
+ * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
+ * will not be set. Hence we need to check for both these flag conditions to
+ * avoid inodes that are no longer reclaim candidates.
+ *
+ * Note: checking for other state flags here, under the i_flags_lock or not, is
+ * racy and should be avoided. Those races should be resolved only after we have
+ * ensured that we are able to reclaim this inode and the world can see that we
+ * are going to reclaim it.
+ *
+ * Return true if we grabbed it, false otherwise.
*/
-STATIC int
+static bool
xfs_reclaim_inode_grab(
- struct xfs_inode *ip,
- int flags)
+ struct xfs_inode *ip)
{
ASSERT(rcu_read_lock_held());
- /* quick check for stale RCU freed inode */
- if (!ip->i_ino)
- return 1;
-
- /*
- * If we are asked for non-blocking operation, do unlocked checks to
- * see if the inode already is being flushed or in reclaim to avoid
- * lock traffic.
- */
- if ((flags & SYNC_TRYLOCK) &&
- __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
- return 1;
-
- /*
- * The radix tree lock here protects a thread in xfs_iget from racing
- * with us starting reclaim on the inode. Once we have the
- * XFS_IRECLAIM flag set it will not touch us.
- *
- * Due to RCU lookup, we may find inodes that have been freed and only
- * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
- * aren't candidates for reclaim at all, so we must check the
- * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
- */
spin_lock(&ip->i_flags_lock);
if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
__xfs_iflags_test(ip, XFS_IRECLAIM)) {
/* not a reclaim candidate. */
spin_unlock(&ip->i_flags_lock);
- return 1;
+ return false;
}
__xfs_iflags_set(ip, XFS_IRECLAIM);
spin_unlock(&ip->i_flags_lock);
- return 0;
+ return true;
}
/*
- * Inodes in different states need to be treated differently. The following
- * table lists the inode states and the reclaim actions necessary:
- *
- * inode state iflush ret required action
- * --------------- ---------- ---------------
- * bad - reclaim
- * shutdown EIO unpin and reclaim
- * clean, unpinned 0 reclaim
- * stale, unpinned 0 reclaim
- * clean, pinned(*) 0 requeue
- * stale, pinned EAGAIN requeue
- * dirty, async - requeue
- * dirty, sync 0 reclaim
- *
- * (*) dgc: I don't think the clean, pinned state is possible but it gets
- * handled anyway given the order of checks implemented.
- *
- * Also, because we get the flush lock first, we know that any inode that has
- * been flushed delwri has had the flush completed by the time we check that
- * the inode is clean.
- *
- * Note that because the inode is flushed delayed write by AIL pushing, the
- * flush lock may already be held here and waiting on it can result in very
- * long latencies. Hence for sync reclaims, where we wait on the flush lock,
- * the caller should push the AIL first before trying to reclaim inodes to
- * minimise the amount of time spent waiting. For background relaim, we only
- * bother to reclaim clean inodes anyway.
+ * Inode reclaim is non-blocking, so the default action if progress cannot be
+ * made is to "requeue" the inode for reclaim by unlocking it and clearing the
+ * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about
+ * blocking anymore and hence we can wait for the inode to be able to reclaim
+ * it.
*
- * Hence the order of actions after gaining the locks should be:
- * bad => reclaim
- * shutdown => unpin and reclaim
- * pinned, async => requeue
- * pinned, sync => unpin
- * stale => reclaim
- * clean => reclaim
- * dirty, async => requeue
- * dirty, sync => flush, wait and reclaim
+ * We do no IO here - if callers require inodes to be cleaned they must push the
+ * AIL first to trigger writeback of dirty inodes. This enables writeback to be
+ * done in the background in a non-blocking manner, and enables memory reclaim
+ * to make progress without blocking.
*/
-STATIC int
+static void
xfs_reclaim_inode(
struct xfs_inode *ip,
- struct xfs_perag *pag,
- int sync_mode)
+ struct xfs_perag *pag)
{
- struct xfs_buf *bp = NULL;
xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */
- int error;
-restart:
- error = 0;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (!xfs_iflock_nowait(ip)) {
- if (!(sync_mode & SYNC_WAIT))
- goto out;
- xfs_iflock(ip);
- }
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ goto out;
+ if (!xfs_iflock_nowait(ip))
+ goto out_iunlock;
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_iunpin_wait(ip);
/* xfs_iflush_abort() drops the flush lock */
- xfs_iflush_abort(ip, false);
+ xfs_iflush_abort(ip);
goto reclaim;
}
- if (xfs_ipincount(ip)) {
- if (!(sync_mode & SYNC_WAIT))
- goto out_ifunlock;
- xfs_iunpin_wait(ip);
- }
- if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) {
- xfs_ifunlock(ip);
- goto reclaim;
- }
-
- /*
- * Never flush out dirty data during non-blocking reclaim, as it would
- * just contend with AIL pushing trying to do the same job.
- */
- if (!(sync_mode & SYNC_WAIT))
+ if (xfs_ipincount(ip))
+ goto out_ifunlock;
+ if (!xfs_inode_clean(ip))
goto out_ifunlock;
- /*
- * Now we have an inode that needs flushing.
- *
- * Note that xfs_iflush will never block on the inode buffer lock, as
- * xfs_ifree_cluster() can lock the inode buffer before it locks the
- * ip->i_lock, and we are doing the exact opposite here. As a result,
- * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
- * result in an ABBA deadlock with xfs_ifree_cluster().
- *
- * As xfs_ifree_cluser() must gather all inodes that are active in the
- * cache to mark them stale, if we hit this case we don't actually want
- * to do IO here - we want the inode marked stale so we can simply
- * reclaim it. Hence if we get an EAGAIN error here, just unlock the
- * inode, back off and try again. Hopefully the next pass through will
- * see the stale flag set on the inode.
- */
- error = xfs_iflush(ip, &bp);
- if (error == -EAGAIN) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /* backoff longer than in xfs_ifree_cluster */
- delay(2);
- goto restart;
- }
-
- if (!error) {
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- }
-
+ xfs_ifunlock(ip);
reclaim:
ASSERT(!xfs_isiflocked(ip));
@@ -1209,23 +1096,17 @@ reclaim:
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_qm_dqdetach(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ ASSERT(xfs_inode_clean(ip));
__xfs_inode_free(ip);
- return error;
+ return;
out_ifunlock:
xfs_ifunlock(ip);
+out_iunlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
xfs_iflags_clear(ip, XFS_IRECLAIM);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /*
- * We could return -EAGAIN here to make reclaim rescan the inode tree in
- * a short while. However, this just burns CPU time scanning the tree
- * waiting for IO to complete and the reclaim work never goes back to
- * the idle state. Instead, return 0 to let the next scheduled
- * background reclaim attempt to reclaim the inode again.
- */
- return 0;
}
/*
@@ -1233,23 +1114,19 @@ out:
* corrupted, we still want to try to reclaim all the inodes. If we don't,
* then a shut down during filesystem unmount reclaim walk leak all the
* unreclaimed inodes.
+ *
+ * Returns non-zero if any AGs or inodes were skipped in the reclaim pass
+ * so that callers that want to block until all dirty inodes are written back
+ * and reclaimed can sanely loop.
*/
-STATIC int
+static void
xfs_reclaim_inodes_ag(
struct xfs_mount *mp,
- int flags,
int *nr_to_scan)
{
struct xfs_perag *pag;
- int error = 0;
- int last_error = 0;
- xfs_agnumber_t ag;
- int trylock = flags & SYNC_TRYLOCK;
- int skipped;
+ xfs_agnumber_t ag = 0;
-restart:
- ag = 0;
- skipped = 0;
while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
unsigned long first_index = 0;
int done = 0;
@@ -1257,16 +1134,7 @@ restart:
ag = pag->pag_agno + 1;
- if (trylock) {
- if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
- skipped++;
- xfs_perag_put(pag);
- continue;
- }
- first_index = pag->pag_ici_reclaim_cursor;
- } else
- mutex_lock(&pag->pag_ici_reclaim_lock);
-
+ first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
do {
struct xfs_inode *batch[XFS_LOOKUP_BATCH];
int i;
@@ -1290,7 +1158,7 @@ restart:
for (i = 0; i < nr_found; i++) {
struct xfs_inode *ip = batch[i];
- if (done || xfs_reclaim_inode_grab(ip, flags))
+ if (done || !xfs_reclaim_inode_grab(ip))
batch[i] = NULL;
/*
@@ -1319,59 +1187,39 @@ restart:
rcu_read_unlock();
for (i = 0; i < nr_found; i++) {
- if (!batch[i])
- continue;
- error = xfs_reclaim_inode(batch[i], pag, flags);
- if (error && last_error != -EFSCORRUPTED)
- last_error = error;
+ if (batch[i])
+ xfs_reclaim_inode(batch[i], pag);
}
*nr_to_scan -= XFS_LOOKUP_BATCH;
-
cond_resched();
-
} while (nr_found && !done && *nr_to_scan > 0);
- if (trylock && !done)
- pag->pag_ici_reclaim_cursor = first_index;
- else
- pag->pag_ici_reclaim_cursor = 0;
- mutex_unlock(&pag->pag_ici_reclaim_lock);
+ if (done)
+ first_index = 0;
+ WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
xfs_perag_put(pag);
}
-
- /*
- * if we skipped any AG, and we still have scan count remaining, do
- * another pass this time using blocking reclaim semantics (i.e
- * waiting on the reclaim locks and ignoring the reclaim cursors). This
- * ensure that when we get more reclaimers than AGs we block rather
- * than spin trying to execute reclaim.
- */
- if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
- trylock = 0;
- goto restart;
- }
- return last_error;
}
-int
+void
xfs_reclaim_inodes(
- xfs_mount_t *mp,
- int mode)
+ struct xfs_mount *mp)
{
int nr_to_scan = INT_MAX;
- return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+ while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+ xfs_ail_push_all_sync(mp->m_ail);
+ xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+ };
}
/*
- * Scan a certain number of inodes for reclaim.
- *
- * When called we make sure that there is a background (fast) inode reclaim in
- * progress, while we will throttle the speed of reclaim via doing synchronous
- * reclaim of inodes. That means if we come across dirty inodes, we wait for
- * them to be cleaned, which we hope will not be very long due to the
- * background walker having already kicked the IO off on those dirty inodes.
+ * The shrinker infrastructure determines how many inodes we should scan for
+ * reclaim. We want as many clean inodes ready to reclaim as possible, so we
+ * push the AIL here. We also want to proactively free up memory if we can to
+ * minimise the amount of work memory reclaim has to do so we kick the
+ * background reclaim if it isn't already scheduled.
*/
long
xfs_reclaim_inodes_nr(
@@ -1382,7 +1230,8 @@ xfs_reclaim_inodes_nr(
xfs_reclaim_work_queue(mp);
xfs_ail_push_all(mp->m_ail);
- return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+ xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+ return 0;
}
/*
@@ -1405,59 +1254,108 @@ xfs_reclaim_inodes_count(
return reclaimable;
}
-STATIC int
+STATIC bool
xfs_inode_match_id(
struct xfs_inode *ip,
struct xfs_eofblocks *eofb)
{
if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
!uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
- return 0;
+ return false;
if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
!gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
- return 0;
+ return false;
if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
ip->i_d.di_projid != eofb->eof_prid)
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
* A union-based inode filtering algorithm. Process the inode if any of the
* criteria match. This is for global/internal scans only.
*/
-STATIC int
+STATIC bool
xfs_inode_match_id_union(
struct xfs_inode *ip,
struct xfs_eofblocks *eofb)
{
if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
- return 1;
+ return true;
if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
- return 1;
+ return true;
if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
ip->i_d.di_projid == eofb->eof_prid)
- return 1;
+ return true;
- return 0;
+ return false;
+}
+
+/*
+ * Is this inode @ip eligible for eof/cow block reclamation, given some
+ * filtering parameters @eofb? The inode is eligible if @eofb is null or
+ * if the predicate functions match.
+ */
+static bool
+xfs_inode_matches_eofb(
+ struct xfs_inode *ip,
+ struct xfs_eofblocks *eofb)
+{
+ bool match;
+
+ if (!eofb)
+ return true;
+
+ if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
+ match = xfs_inode_match_id_union(ip, eofb);
+ else
+ match = xfs_inode_match_id(ip, eofb);
+ if (!match)
+ return false;
+
+ /* skip the inode if the file size is too small */
+ if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) &&
+ XFS_ISIZE(ip) < eofb->eof_min_file_size)
+ return false;
+
+ return true;
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low.
+ */
+void
+xfs_reclaim_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_reclaim_work);
+ int nr_to_scan = INT_MAX;
+
+ xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+ xfs_reclaim_work_queue(mp);
}
STATIC int
xfs_inode_free_eofblocks(
struct xfs_inode *ip,
- int flags,
void *args)
{
- int ret = 0;
- struct xfs_eofblocks *eofb = args;
- int match;
+ struct xfs_eofblocks *eofb = args;
+ bool wait;
+ int ret;
+
+ wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
if (!xfs_can_free_eofblocks(ip, false)) {
/* inode could be preallocated or append-only */
@@ -1470,62 +1368,34 @@ xfs_inode_free_eofblocks(
* If the mapping is dirty the operation can block and wait for some
* time. Unless we are waiting, skip it.
*/
- if (!(flags & SYNC_WAIT) &&
- mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
+ if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
return 0;
- if (eofb) {
- if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
- match = xfs_inode_match_id_union(ip, eofb);
- else
- match = xfs_inode_match_id(ip, eofb);
- if (!match)
- return 0;
-
- /* skip the inode if the file size is too small */
- if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
- XFS_ISIZE(ip) < eofb->eof_min_file_size)
- return 0;
- }
+ if (!xfs_inode_matches_eofb(ip, eofb))
+ return 0;
/*
* If the caller is waiting, return -EAGAIN to keep the background
* scanner moving and revisit the inode in a subsequent pass.
*/
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
- if (flags & SYNC_WAIT)
- ret = -EAGAIN;
- return ret;
+ if (wait)
+ return -EAGAIN;
+ return 0;
}
+
ret = xfs_free_eofblocks(ip);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
-static int
-__xfs_icache_free_eofblocks(
- struct xfs_mount *mp,
- struct xfs_eofblocks *eofb,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int tag)
-{
- int flags = SYNC_TRYLOCK;
-
- if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
- flags = SYNC_WAIT;
-
- return xfs_inode_ag_iterator_tag(mp, execute, flags,
- eofb, tag);
-}
-
int
xfs_icache_free_eofblocks(
struct xfs_mount *mp,
struct xfs_eofblocks *eofb)
{
- return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks,
+ return xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb,
XFS_ICI_EOFBLOCKS_TAG);
}
@@ -1552,7 +1422,7 @@ __xfs_inode_free_quota_eofblocks(
eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
- dq = xfs_inode_dquot(ip, XFS_DQ_USER);
+ dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER);
if (dq && xfs_dquot_lowsp(dq)) {
eofb.eof_uid = VFS_I(ip)->i_uid;
eofb.eof_flags |= XFS_EOF_FLAGS_UID;
@@ -1561,7 +1431,7 @@ __xfs_inode_free_quota_eofblocks(
}
if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
- dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
+ dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP);
if (dq && xfs_dquot_lowsp(dq)) {
eofb.eof_gid = VFS_I(ip)->i_gid;
eofb.eof_flags |= XFS_EOF_FLAGS_GID;
@@ -1742,29 +1612,16 @@ xfs_prep_free_cowblocks(
STATIC int
xfs_inode_free_cowblocks(
struct xfs_inode *ip,
- int flags,
void *args)
{
struct xfs_eofblocks *eofb = args;
- int match;
int ret = 0;
if (!xfs_prep_free_cowblocks(ip))
return 0;
- if (eofb) {
- if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
- match = xfs_inode_match_id_union(ip, eofb);
- else
- match = xfs_inode_match_id(ip, eofb);
- if (!match)
- return 0;
-
- /* skip the inode if the file size is too small */
- if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
- XFS_ISIZE(ip) < eofb->eof_min_file_size)
- return 0;
- }
+ if (!xfs_inode_matches_eofb(ip, eofb))
+ return 0;
/* Free the CoW blocks */
xfs_ilock(ip, XFS_IOLOCK_EXCL);
@@ -1788,7 +1645,7 @@ xfs_icache_free_cowblocks(
struct xfs_mount *mp,
struct xfs_eofblocks *eofb)
{
- return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks,
+ return xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb,
XFS_ICI_COWBLOCKS_TAG);
}