diff options
Diffstat (limited to 'fs/btrfs/extent_map.c')
-rw-r--r-- | fs/btrfs/extent_map.c | 132 |
1 files changed, 102 insertions, 30 deletions
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 1d93e1202c33..36af9aa9aab1 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1128,11 +1128,14 @@ struct btrfs_em_shrink_ctx { static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) { - const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info); struct extent_map_tree *tree = &inode->extent_tree; long nr_dropped = 0; struct rb_node *node; + lockdep_assert_held_write(&tree->lock); + /* * Take the mmap lock so that we serialize with the inode logging phase * of fsync because we may need to set the full sync flag on the inode, @@ -1144,28 +1147,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c * to find new extents, which may not be there yet because ordered * extents haven't completed yet. * - * We also do a try lock because otherwise we could deadlock. This is - * because the shrinker for this filesystem may be invoked while we are - * in a path that is holding the mmap lock in write mode. For example in - * a reflink operation while COWing an extent buffer, when allocating - * pages for a new extent buffer and under memory pressure, the shrinker - * may be invoked, and therefore we would deadlock by attempting to read - * lock the mmap lock while we are holding already a write lock on it. + * We also do a try lock because we don't want to block for too long and + * we are holding the extent map tree's lock in write mode. */ if (!down_read_trylock(&inode->i_mmap_lock)) return 0; - /* - * We want to be fast so if the lock is busy we don't want to spend time - * waiting for it - either some task is about to do IO for the inode or - * we may have another task shrinking extent maps, here in this code, so - * skip this inode. - */ - if (!write_trylock(&tree->lock)) { - up_read(&inode->i_mmap_lock); - return 0; - } - node = rb_first(&tree->root); while (node) { struct rb_node *next = rb_next(node); @@ -1201,36 +1188,89 @@ next: * lock. This is to avoid slowing other tasks trying to take the * lock. */ - if (need_resched() || rwlock_needbreak(&tree->lock)) + if (need_resched() || rwlock_needbreak(&tree->lock) || + btrfs_fs_closing(fs_info)) break; node = next; } - write_unlock(&tree->lock); up_read(&inode->i_mmap_lock); return nr_dropped; } +static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root, + u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + struct extent_map_tree *tree; + + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + + tree = &inode->extent_tree; + + /* + * We want to be fast so if the lock is busy we don't want to + * spend time waiting for it (some task is about to do IO for + * the inode). + */ + if (!write_trylock(&tree->lock)) + goto next; + + /* + * Skip inode if it doesn't have loaded extent maps, so we avoid + * getting a reference and doing an iput later. This includes + * cases like files that were opened for things like stat(2), or + * files with all extent maps previously released through the + * release folio callback (btrfs_release_folio()) or released in + * a previous run, or directories which never have extent maps. + */ + if (RB_EMPTY_ROOT(&tree->root)) { + write_unlock(&tree->lock); + goto next; + } + + if (igrab(&inode->vfs_inode)) + break; + + write_unlock(&tree->lock); +next: + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; long nr_dropped = 0; u64 min_ino = ctx->last_ino + 1; - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); + write_unlock(&inode->extent_tree.lock); min_ino = btrfs_ino(inode) + 1; ctx->last_ino = btrfs_ino(inode); - btrfs_add_delayed_iput(inode); + iput(&inode->vfs_inode); - if (ctx->scanned >= ctx->nr_to_scan) + if (ctx->scanned >= ctx->nr_to_scan || + btrfs_fs_closing(fs_info)) break; cond_resched(); - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); } if (inode) { @@ -1254,16 +1294,19 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx return nr_dropped; } -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +static void btrfs_extent_map_shrinker_worker(struct work_struct *work) { + struct btrfs_fs_info *fs_info; struct btrfs_em_shrink_ctx ctx; u64 start_root_id; u64 next_root_id; bool cycled = false; long nr_dropped = 0; + fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work); + ctx.scanned = 0; - ctx.nr_to_scan = nr_to_scan; + ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); /* * In case we have multiple tasks running this shrinker, make the next @@ -1281,12 +1324,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan, nr, ctx.last_root, ctx.last_ino); } - while (ctx.scanned < ctx.nr_to_scan) { + while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { struct btrfs_root *root; unsigned long count; @@ -1344,5 +1387,34 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) ctx.last_ino); } - return nr_dropped; + atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); +} + +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +{ + /* + * Do nothing if the shrinker is already running. In case of high memory + * pressure we can have a lot of tasks calling us and all passing the + * same nr_to_scan value, but in reality we may need only to free + * nr_to_scan extent maps (or less). In case we need to free more than + * that, we will be called again by the fs shrinker, so no worries about + * not doing enough work to reclaim memory from extent maps. + * We can also be repeatedly called with the same nr_to_scan value + * simply because the shrinker runs asynchronously and multiple calls + * to this function are made before the shrinker does enough progress. + * + * That's why we set the atomic counter to nr_to_scan only if its + * current value is zero, instead of incrementing the counter by + * nr_to_scan. + */ + if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0) + return; + + queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work); +} + +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) +{ + atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); + INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker); } |