summaryrefslogtreecommitdiff
path: root/fs/btrfs/extent_map.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/extent_map.c')
-rw-r--r--fs/btrfs/extent_map.c132
1 files changed, 102 insertions, 30 deletions
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1d93e1202c33..36af9aa9aab1 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1128,11 +1128,14 @@ struct btrfs_em_shrink_ctx {
static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{
- const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
struct extent_map_tree *tree = &inode->extent_tree;
long nr_dropped = 0;
struct rb_node *node;
+ lockdep_assert_held_write(&tree->lock);
+
/*
* Take the mmap lock so that we serialize with the inode logging phase
* of fsync because we may need to set the full sync flag on the inode,
@@ -1144,28 +1147,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
* to find new extents, which may not be there yet because ordered
* extents haven't completed yet.
*
- * We also do a try lock because otherwise we could deadlock. This is
- * because the shrinker for this filesystem may be invoked while we are
- * in a path that is holding the mmap lock in write mode. For example in
- * a reflink operation while COWing an extent buffer, when allocating
- * pages for a new extent buffer and under memory pressure, the shrinker
- * may be invoked, and therefore we would deadlock by attempting to read
- * lock the mmap lock while we are holding already a write lock on it.
+ * We also do a try lock because we don't want to block for too long and
+ * we are holding the extent map tree's lock in write mode.
*/
if (!down_read_trylock(&inode->i_mmap_lock))
return 0;
- /*
- * We want to be fast so if the lock is busy we don't want to spend time
- * waiting for it - either some task is about to do IO for the inode or
- * we may have another task shrinking extent maps, here in this code, so
- * skip this inode.
- */
- if (!write_trylock(&tree->lock)) {
- up_read(&inode->i_mmap_lock);
- return 0;
- }
-
node = rb_first(&tree->root);
while (node) {
struct rb_node *next = rb_next(node);
@@ -1201,36 +1188,89 @@ next:
* lock. This is to avoid slowing other tasks trying to take the
* lock.
*/
- if (need_resched() || rwlock_needbreak(&tree->lock))
+ if (need_resched() || rwlock_needbreak(&tree->lock) ||
+ btrfs_fs_closing(fs_info))
break;
node = next;
}
- write_unlock(&tree->lock);
up_read(&inode->i_mmap_lock);
return nr_dropped;
}
+static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
+ u64 min_ino)
+{
+ struct btrfs_inode *inode;
+ unsigned long from = min_ino;
+
+ xa_lock(&root->inodes);
+ while (true) {
+ struct extent_map_tree *tree;
+
+ inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+ if (!inode)
+ break;
+
+ tree = &inode->extent_tree;
+
+ /*
+ * We want to be fast so if the lock is busy we don't want to
+ * spend time waiting for it (some task is about to do IO for
+ * the inode).
+ */
+ if (!write_trylock(&tree->lock))
+ goto next;
+
+ /*
+ * Skip inode if it doesn't have loaded extent maps, so we avoid
+ * getting a reference and doing an iput later. This includes
+ * cases like files that were opened for things like stat(2), or
+ * files with all extent maps previously released through the
+ * release folio callback (btrfs_release_folio()) or released in
+ * a previous run, or directories which never have extent maps.
+ */
+ if (RB_EMPTY_ROOT(&tree->root)) {
+ write_unlock(&tree->lock);
+ goto next;
+ }
+
+ if (igrab(&inode->vfs_inode))
+ break;
+
+ write_unlock(&tree->lock);
+next:
+ from = btrfs_ino(inode) + 1;
+ cond_resched_lock(&root->inodes.xa_lock);
+ }
+ xa_unlock(&root->inodes);
+
+ return inode;
+}
+
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_inode *inode;
long nr_dropped = 0;
u64 min_ino = ctx->last_ino + 1;
- inode = btrfs_find_first_inode(root, min_ino);
+ inode = find_first_inode_to_shrink(root, min_ino);
while (inode) {
nr_dropped += btrfs_scan_inode(inode, ctx);
+ write_unlock(&inode->extent_tree.lock);
min_ino = btrfs_ino(inode) + 1;
ctx->last_ino = btrfs_ino(inode);
- btrfs_add_delayed_iput(inode);
+ iput(&inode->vfs_inode);
- if (ctx->scanned >= ctx->nr_to_scan)
+ if (ctx->scanned >= ctx->nr_to_scan ||
+ btrfs_fs_closing(fs_info))
break;
cond_resched();
- inode = btrfs_find_first_inode(root, min_ino);
+ inode = find_first_inode_to_shrink(root, min_ino);
}
if (inode) {
@@ -1254,16 +1294,19 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
return nr_dropped;
}
-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
{
+ struct btrfs_fs_info *fs_info;
struct btrfs_em_shrink_ctx ctx;
u64 start_root_id;
u64 next_root_id;
bool cycled = false;
long nr_dropped = 0;
+ fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work);
+
ctx.scanned = 0;
- ctx.nr_to_scan = nr_to_scan;
+ ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan);
/*
* In case we have multiple tasks running this shrinker, make the next
@@ -1281,12 +1324,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
+ trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan,
nr, ctx.last_root,
ctx.last_ino);
}
- while (ctx.scanned < ctx.nr_to_scan) {
+ while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
struct btrfs_root *root;
unsigned long count;
@@ -1344,5 +1387,34 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
ctx.last_ino);
}
- return nr_dropped;
+ atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
+}
+
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+{
+ /*
+ * Do nothing if the shrinker is already running. In case of high memory
+ * pressure we can have a lot of tasks calling us and all passing the
+ * same nr_to_scan value, but in reality we may need only to free
+ * nr_to_scan extent maps (or less). In case we need to free more than
+ * that, we will be called again by the fs shrinker, so no worries about
+ * not doing enough work to reclaim memory from extent maps.
+ * We can also be repeatedly called with the same nr_to_scan value
+ * simply because the shrinker runs asynchronously and multiple calls
+ * to this function are made before the shrinker does enough progress.
+ *
+ * That's why we set the atomic counter to nr_to_scan only if its
+ * current value is zero, instead of incrementing the counter by
+ * nr_to_scan.
+ */
+ if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
+ return;
+
+ queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work);
+}
+
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
+{
+ atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
+ INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker);
}