summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@linux.dev>2024-09-05 03:49:37 +0300
committerKent Overstreet <kent.overstreet@linux.dev>2024-09-21 18:39:48 +0300
commit7a51608d0125469664e2daf8e060d6d783924c98 (patch)
tree4fa41c4ed7b94e602ad97f8ed73da7e4b56f6893
parent91ddd7151000c0e538cec7fb2f3f86e2268af4d4 (diff)
downloadlinux-7a51608d0125469664e2daf8e060d6d783924c98.tar.xz
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check references to another: extents <-> backpointers Checking references generates random lookups, so we want to pin that btree in memory (or only a range, if it doesn't fit in ram). Previously, this was done with a simple check in the shrinker - "if btree node is in range being pinned, don't free it" - but this generated OOMs, as our shrinker wasn't well behaved if there was less memory available than expected. Instead, we now have two different shrinkers and lru lists; the second shrinker being for pinned nodes, with seeks set much higher than normal - so they can still be freed if necessary, but we'll prefer not to. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--fs/bcachefs/backpointers.c13
-rw-r--r--fs/bcachefs/btree_cache.c164
-rw-r--r--fs/bcachefs/btree_cache.h3
-rw-r--r--fs/bcachefs/btree_types.h21
-rw-r--r--fs/bcachefs/btree_update_interior.c2
-rw-r--r--fs/bcachefs/journal_reclaim.c7
-rw-r--r--fs/bcachefs/sysfs.c15
7 files changed, 150 insertions, 75 deletions
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 6c395445f60c..b88d1801652c 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -752,10 +752,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
s64 mem_may_pin = mem_may_pin_bytes(c);
int ret = 0;
+ bch2_btree_cache_unpin(c);
+
btree_interior_mask |= btree_leaf_mask;
- c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask;
- c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask;
+ c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
+ c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
c->btree_cache.pinned_nodes_start = start;
c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
@@ -777,6 +779,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
BBPOS(btree, b->key.k.p);
break;
}
+ bch2_node_pin(c, b);
0;
}));
}
@@ -936,8 +939,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
- c->btree_cache.pinned_nodes_leaf_mask = 0;
- c->btree_cache.pinned_nodes_interior_mask = 0;
+ bch2_btree_cache_unpin(c);
bch_err_fn(c, ret);
return ret;
@@ -1053,8 +1055,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
- c->btree_cache.pinned_nodes_leaf_mask = 0;
- c->btree_cache.pinned_nodes_interior_mask = 0;
+ bch2_btree_cache_unpin(c);
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index e66853b98857..6e4afb2b5441 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -47,9 +47,14 @@ void bch2_recalc_btree_reserve(struct bch_fs *c)
c->btree_cache.nr_reserve = reserve;
}
-static inline size_t btree_cache_can_free(struct btree_cache *bc)
+static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
- return max_t(int, 0, bc->nr_live + bc->nr_freeable - bc->nr_reserve);
+ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+
+ size_t can_free = list->nr;
+ if (!list->idx)
+ can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
+ return can_free;
}
static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
@@ -184,6 +189,51 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
six_unlock_intent(&b->c.lock);
}
+static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
+{
+ struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
+
+ u64 mask = bc->pinned_nodes_mask[!!b->c.level];
+
+ return ((mask & BIT_ULL(b->c.btree_id)) &&
+ bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
+ bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
+}
+
+void bch2_node_pin(struct bch_fs *c, struct btree *b)
+{
+ struct btree_cache *bc = &c->btree_cache;
+
+ mutex_lock(&bc->lock);
+ BUG_ON(!__btree_node_pinned(bc, b));
+ if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
+ set_btree_node_pinned(b);
+ list_move(&b->list, &bc->live[1].list);
+ bc->live[0].nr--;
+ bc->live[1].nr++;
+ }
+ mutex_unlock(&bc->lock);
+}
+
+void bch2_btree_cache_unpin(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b, *n;
+
+ mutex_lock(&bc->lock);
+ c->btree_cache.pinned_nodes_mask[0] = 0;
+ c->btree_cache.pinned_nodes_mask[1] = 0;
+
+ list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
+ clear_btree_node_pinned(b);
+ list_move(&b->list, &bc->live[0].list);
+ bc->live[0].nr++;
+ bc->live[1].nr--;
+ }
+
+ mutex_unlock(&bc->lock);
+}
+
/* Btree in memory cache - hash table */
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
@@ -199,7 +249,7 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
if (b->c.btree_id < BTREE_ID_NR)
--bc->nr_by_btree[b->c.btree_id];
- bc->nr_live--;
+ bc->live[btree_node_pinned(b)].nr--;
bc->nr_freeable++;
list_move(&b->list, &bc->freeable);
}
@@ -216,9 +266,14 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
if (b->c.btree_id < BTREE_ID_NR)
bc->nr_by_btree[b->c.btree_id]++;
- bc->nr_live++;
+
+ bool p = __btree_node_pinned(bc, b);
+ mod_bit(BTREE_NODE_pinned, &b->flags, p);
+
+ list_move_tail(&b->list, &bc->live[p].list);
+ bc->live[p].nr++;
+
bc->nr_freeable--;
- list_move_tail(&b->list, &bc->live);
return 0;
}
@@ -283,20 +338,6 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b
int ret = 0;
lockdep_assert_held(&bc->lock);
-
- struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
-
- u64 mask = b->c.level
- ? bc->pinned_nodes_interior_mask
- : bc->pinned_nodes_leaf_mask;
-
- if ((mask & BIT_ULL(b->c.btree_id)) &&
- bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
- bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) {
- BTREE_CACHE_NOT_FREED_INCREMENT(pinned);
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
- }
-
wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
@@ -401,8 +442,9 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = shrink->private_data;
- struct btree_cache *bc = &c->btree_cache;
+ struct btree_cache_list *list = shrink->private_data;
+ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
unsigned long can_free = 0;
@@ -410,8 +452,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
unsigned long touched = 0;
unsigned i, flags;
unsigned long ret = SHRINK_STOP;
- bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >=
- (bc->nr_live + bc->nr_freeable) * 3 / 4;
+ bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
@@ -426,7 +467,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
* succeed, so that inserting keys into the btree can always succeed and
* IO can always make forward progress:
*/
- can_free = btree_cache_can_free(bc);
+ can_free = btree_cache_can_free(list);
nr = min_t(unsigned long, nr, can_free);
i = 0;
@@ -452,7 +493,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
}
}
restart:
- list_for_each_entry_safe(b, t, &bc->live, list) {
+ list_for_each_entry_safe(b, t, &list->list, list) {
touched++;
if (btree_node_accessed(b)) {
@@ -476,7 +517,7 @@ restart:
!btree_node_will_make_reachable(b) &&
!btree_node_write_blocked(b) &&
six_trylock_read(&b->c.lock)) {
- list_move(&bc->live, &b->list);
+ list_move(&list->list, &b->list);
mutex_unlock(&bc->lock);
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
six_unlock_read(&b->c.lock);
@@ -490,8 +531,8 @@ restart:
break;
}
out_rotate:
- if (&t->list != &bc->live)
- list_move_tail(&bc->live, &t->list);
+ if (&t->list != &list->list)
+ list_move_tail(&list->list, &t->list);
out:
mutex_unlock(&bc->lock);
out_nounlock:
@@ -504,40 +545,42 @@ out_nounlock:
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = shrink->private_data;
- struct btree_cache *bc = &c->btree_cache;
+ struct btree_cache_list *list = shrink->private_data;
if (bch2_btree_shrinker_disabled)
return 0;
- return btree_cache_can_free(bc);
+ return btree_cache_can_free(list);
}
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b, *t;
- unsigned i, flags;
+ unsigned long flags;
- shrinker_free(bc->shrink);
+ shrinker_free(bc->live[1].shrink);
+ shrinker_free(bc->live[0].shrink);
/* vfree() can allocate memory: */
flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
if (c->verify_data)
- list_move(&c->verify_data->list, &bc->live);
+ list_move(&c->verify_data->list, &bc->live[0].list);
kvfree(c->verify_ondisk);
- for (i = 0; i < btree_id_nr_alive(c); i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->b)
- list_add(&r->b->list, &bc->live);
+ list_add(&r->b->list, &bc->live[0].list);
}
- list_for_each_entry_safe(b, t, &bc->live, list)
+ list_for_each_entry_safe(b, t, &bc->live[1].list, list)
+ bch2_btree_node_hash_remove(bc, b);
+ list_for_each_entry_safe(b, t, &bc->live[0].list, list)
bch2_btree_node_hash_remove(bc, b);
list_for_each_entry_safe(b, t, &bc->freeable, list) {
@@ -563,7 +606,8 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
BUG_ON(bc->nr_by_btree[i]);
- BUG_ON(bc->nr_live);
+ BUG_ON(bc->live[0].nr);
+ BUG_ON(bc->live[1].nr);
BUG_ON(bc->nr_freeable);
if (bc->table_init_done)
@@ -589,18 +633,28 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
if (!__bch2_btree_node_mem_alloc(c))
goto err;
- list_splice_init(&bc->live, &bc->freeable);
+ list_splice_init(&bc->live[0].list, &bc->freeable);
mutex_init(&c->verify_lock);
shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
if (!shrink)
goto err;
- bc->shrink = shrink;
+ bc->live[0].shrink = shrink;
+ shrink->count_objects = bch2_btree_cache_count;
+ shrink->scan_objects = bch2_btree_cache_scan;
+ shrink->seeks = 2;
+ shrink->private_data = &bc->live[0];
+ shrinker_register(shrink);
+
+ shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
+ if (!shrink)
+ goto err;
+ bc->live[1].shrink = shrink;
shrink->count_objects = bch2_btree_cache_count;
shrink->scan_objects = bch2_btree_cache_scan;
- shrink->seeks = 4;
- shrink->private_data = c;
+ shrink->seeks = 8;
+ shrink->private_data = &bc->live[1];
shrinker_register(shrink);
return 0;
@@ -611,7 +665,10 @@ err:
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
mutex_init(&bc->lock);
- INIT_LIST_HEAD(&bc->live);
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
+ bc->live[i].idx = i;
+ INIT_LIST_HEAD(&bc->live[i].list);
+ }
INIT_LIST_HEAD(&bc->freeable);
INIT_LIST_HEAD(&bc->freed_pcpu);
INIT_LIST_HEAD(&bc->freed_nonpcpu);
@@ -673,14 +730,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_reclaim(c, b, false))
- return b;
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+ list_for_each_entry_reverse(b, &bc->live[i].list, list)
+ if (!btree_node_reclaim(c, b, false))
+ return b;
while (1) {
- list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_write_and_reclaim(c, b))
- return b;
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+ list_for_each_entry_reverse(b, &bc->live[i].list, list)
+ if (!btree_node_write_and_reclaim(c, b))
+ return b;
/*
* Rare case: all nodes were intent-locked.
@@ -1387,9 +1446,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
- prt_btree_cache_line(out, c, "nr_live:", bc->nr_live);
- prt_btree_cache_line(out, c, "nr_freeable:", bc->nr_freeable);
- prt_btree_cache_line(out, c, "nr dirty:", atomic_long_read(&bc->nr_dirty));
+ prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
+ prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
+ prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
+ prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
prt_newline(out);
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index f82064007127..367acd217c6a 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
+void bch2_node_pin(struct bch_fs *, struct btree *);
+void bch2_btree_cache_unpin(struct bch_fs *);
+
void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_i *);
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index ee3df2a486cc..4568a41fefaf 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -147,8 +147,7 @@ struct btree {
x(noevict) \
x(write_blocked) \
x(will_make_reachable) \
- x(access_bit) \
- x(pinned) \
+ x(access_bit)
enum bch_btree_cache_not_freed_reasons {
#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
@@ -157,6 +156,13 @@ enum bch_btree_cache_not_freed_reasons {
BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
};
+struct btree_cache_list {
+ unsigned idx;
+ struct shrinker *shrink;
+ struct list_head list;
+ size_t nr;
+};
+
struct btree_cache {
struct rhashtable table;
bool table_init_done;
@@ -174,12 +180,11 @@ struct btree_cache {
* should never grow past ~2-3 nodes in practice.
*/
struct mutex lock;
- struct list_head live;
struct list_head freeable;
struct list_head freed_pcpu;
struct list_head freed_nonpcpu;
+ struct btree_cache_list live[2];
- size_t nr_live;
size_t nr_freeable;
size_t nr_reserve;
size_t nr_by_btree[BTREE_ID_NR];
@@ -188,7 +193,6 @@ struct btree_cache {
/* shrinker stats */
size_t nr_freed;
u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
- struct shrinker *shrink;
/*
* If we need to allocate memory for a new btree node and that
@@ -201,8 +205,8 @@ struct btree_cache {
struct bbpos pinned_nodes_start;
struct bbpos pinned_nodes_end;
- u64 pinned_nodes_leaf_mask;
- u64 pinned_nodes_interior_mask;
+ /* btree id mask: 0 for leaves, 1 for interior */
+ u64 pinned_nodes_mask[2];
};
struct btree_node_iter {
@@ -594,7 +598,8 @@ enum btree_write_type {
x(dying) \
x(fake) \
x(need_rewrite) \
- x(never_write)
+ x(never_write) \
+ x(pinned)
enum btree_flags {
/* First bits for btree node write type */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 18494a662e0a..190bc1e81756 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -1904,7 +1904,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
six_unlock_intent(&n->c.lock);
mutex_lock(&c->btree_cache.lock);
- list_add_tail(&b->list, &c->btree_cache.live);
+ list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
mutex_unlock(&c->btree_cache.lock);
bch2_trans_verify_locks(trans);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index f8e045982753..ace291f175dd 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -641,6 +641,7 @@ static u64 journal_seq_to_flush(struct journal *j)
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct btree_cache *bc = &c->btree_cache;
bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush;
size_t min_nr, min_key_cache, nr_flushed;
@@ -681,7 +682,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
- if (atomic_long_read(&c->btree_cache.nr_dirty) * 2 > c->btree_cache.nr_live)
+ size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
+ if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
min_nr = 1;
min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
@@ -689,8 +691,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
- atomic_long_read(&c->btree_cache.nr_dirty),
- c->btree_cache.nr_live,
+ atomic_long_read(&bc->nr_dirty), btree_cache_live,
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 6791540d6a4a..03e59f86f360 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -244,14 +244,18 @@ static struct attribute sysfs_state_rw = {
static size_t bch2_btree_cache_size(struct bch_fs *c)
{
+ struct btree_cache *bc = &c->btree_cache;
size_t ret = 0;
struct btree *b;
- mutex_lock(&c->btree_cache.lock);
- list_for_each_entry(b, &c->btree_cache.live, list)
+ mutex_lock(&bc->lock);
+ list_for_each_entry(b, &bc->live[0].list, list)
ret += btree_buf_bytes(b);
-
- mutex_unlock(&c->btree_cache.lock);
+ list_for_each_entry(b, &bc->live[1].list, list)
+ ret += btree_buf_bytes(b);
+ list_for_each_entry(b, &bc->freeable, list)
+ ret += btree_buf_bytes(b);
+ mutex_unlock(&bc->lock);
return ret;
}
@@ -444,11 +448,12 @@ STORE(bch2_fs)
return -EROFS;
if (attr == &sysfs_trigger_btree_cache_shrink) {
+ struct btree_cache *bc = &c->btree_cache;
struct shrink_control sc;
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
+ bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
}
if (attr == &sysfs_trigger_btree_key_cache_shrink) {