diff options
Diffstat (limited to 'fs/bcachefs/disk_accounting.c')
-rw-r--r-- | fs/bcachefs/disk_accounting.c | 203 |
1 files changed, 132 insertions, 71 deletions
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 07eb8fa1b026..1f0422bfae35 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -79,6 +79,8 @@ static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_ memcpy_u64s_small(acc->v.d, d, nr); } +static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); + int bch2_disk_accounting_mod(struct btree_trans *trans, struct disk_accounting_pos *k, s64 *d, unsigned nr, bool gc) @@ -96,19 +98,25 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, accounting_key_init(&k_i.k, k, d, nr); - return likely(!gc) - ? bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k) - : bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + if (unlikely(gc)) { + int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + if (ret == -BCH_ERR_btree_insert_need_mark_replicas) + ret = drop_locks_do(trans, + bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: + bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + return ret; + } else { + return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); + } } int bch2_mod_dev_cached_sectors(struct btree_trans *trans, unsigned dev, s64 sectors, bool gc) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; - + struct disk_accounting_pos acc; + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_replicas_entry_cached(&acc.replicas, dev); return bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); @@ -126,15 +134,22 @@ static inline bool is_zero(char *start, char *end) #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) +static const unsigned bch2_accounting_type_nr_counters[] = { +#define x(f, id, nr) [BCH_DISK_ACCOUNTING_##f] = nr, + BCH_DISK_ACCOUNTING_TYPES() +#undef x +}; + int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); void *end = &acc_k + 1; int ret = 0; - bkey_fsck_err_on(bversion_zero(k.k->bversion), + bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && + bversion_zero(k.k->bversion), c, accounting_key_version_0, "accounting key with version=0"); @@ -183,6 +198,11 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), c, accounting_key_junk_at_end, "junk at end of accounting key"); + + bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type], + c, accounting_key_nr_counters_wrong, + "accounting key with %u counters, should be %u", + bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]); fsck_err: return ret; } @@ -217,7 +237,8 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po prt_printf(out, "id=%u", k->snapshot.id); break; case BCH_DISK_ACCOUNTING_btree: - prt_printf(out, "btree=%s", bch2_btree_id_str(k->btree.id)); + prt_str(out, "btree="); + bch2_btree_id_to_text(out, k->btree.id); break; } } @@ -243,10 +264,10 @@ void bch2_accounting_swab(struct bkey_s k) } static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, - struct disk_accounting_pos acc) + struct disk_accounting_pos *acc) { - unsafe_memcpy(r, &acc.replicas, - replicas_entry_bytes(&acc.replicas), + unsafe_memcpy(r, &acc->replicas, + replicas_entry_bytes(&acc->replicas), "variable length struct"); } @@ -257,7 +278,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc switch (acc_k.type) { case BCH_DISK_ACCOUNTING_replicas: - __accounting_to_replicas(r, acc_k); + __accounting_to_replicas(r, &acc_k); return true; default: return false; @@ -322,6 +343,14 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); + + if (trace_accounting_mem_insert_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_accounting_to_text(&buf, c, a.s_c); + trace_accounting_mem_insert(c, buf.buf); + printbuf_exit(&buf); + } return 0; err: free_percpu(n.v[1]); @@ -347,6 +376,19 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, return ret; } +int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a, + enum bch_accounting_mode mode) +{ + struct bch_replicas_padded r; + + if (mode != BCH_ACCOUNTING_read && + accounting_to_replicas(&r.e, a.k->p) && + !bch2_replicas_marked_locked(c, &r.e)) + return -BCH_ERR_btree_insert_need_mark_replicas; + + return __bch2_accounting_mem_insert(c, a); +} + static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e) { for (unsigned i = 0; i < e->nr_counters; i++) @@ -461,32 +503,6 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc return ret; } -void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - - percpu_down_read(&c->mark_lock); - out->atomic++; - - eytzinger0_for_each(i, acc->k.nr) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos); - - bch2_accounting_key_to_text(out, &acc_k); - - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); - - prt_str(out, ":"); - for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) - prt_printf(out, " %llu", v[j]); - prt_newline(out); - } - - --out->atomic; - percpu_up_read(&c->mark_lock); -} - static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) { darray_for_each(acc->k, e) { @@ -580,7 +596,7 @@ int bch2_gc_accounting_done(struct bch_fs *c) accounting_key_init(&k_i.k, &acc_k, src_v, nr); bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), - BCH_ACCOUNTING_normal); + BCH_ACCOUNTING_normal, true); preempt_disable(); struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); @@ -609,7 +625,7 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) percpu_down_read(&c->mark_lock); int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), - BCH_ACCOUNTING_read); + BCH_ACCOUNTING_read, false); percpu_up_read(&c->mark_lock); return ret; } @@ -625,7 +641,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, switch (acc.type) { case BCH_DISK_ACCOUNTING_replicas: { struct bch_replicas_padded r; - __accounting_to_replicas(&r.e, acc); + __accounting_to_replicas(&r.e, &acc); for (unsigned i = 0; i < r.e.nr_devs; i++) if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && @@ -642,7 +658,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), trans, accounting_replicas_not_marked, - "accounting not marked in superblock replicas\n %s", + "accounting not marked in superblock replicas\n%s", (printbuf_reset(&buf), bch2_accounting_key_to_text(&buf, &acc), buf.buf))) { @@ -672,7 +688,7 @@ fsck_err: return ret; invalid_device: if (fsck_err(trans, accounting_to_invalid_device, - "accounting entry points to invalid device %i\n %s", + "accounting entry points to invalid device %i\n%s", invalid_dev, (printbuf_reset(&buf), bch2_accounting_key_to_text(&buf, &acc), @@ -699,11 +715,47 @@ int bch2_accounting_read(struct bch_fs *c) struct btree_trans *trans = bch2_trans_get(c); struct printbuf buf = PRINTBUF; - int ret = for_each_btree_key(trans, iter, - BTREE_ID_accounting, POS_MIN, + /* + * We might run more than once if we rewind to start topology repair or + * btree node scan - and those might cause us to get different results, + * so we can't just skip if we've already run. + * + * Instead, zero out any accounting we have: + */ + percpu_down_write(&c->mark_lock); + darray_for_each(acc->k, e) + percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); + for_each_member_device(c, ca) + percpu_memset(ca->usage, 0, sizeof(*ca->usage)); + percpu_memset(c->usage, 0, sizeof(*c->usage)); + percpu_up_write(&c->mark_lock); + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + iter.flags &= ~BTREE_ITER_with_journal; + int ret = for_each_btree_key_continue(trans, iter, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); + + if (k.k->type != KEY_TYPE_accounting) + continue; + + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + break; + + if (!bch2_accounting_is_mem(acc_k)) { + struct disk_accounting_pos next; + memset(&next, 0, sizeof(next)); + next.type = acc_k.type + 1; + bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); + continue; + } + accounting_read_key(trans, k); })); if (ret) @@ -715,6 +767,12 @@ int bch2_accounting_read(struct bch_fs *c) darray_for_each(*keys, i) { if (i->k->k.type == KEY_TYPE_accounting) { + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); + + if (!bch2_accounting_is_mem(acc_k)) + continue; + struct bkey_s_c k = bkey_i_to_s_c(i->k); unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), @@ -748,15 +806,16 @@ int bch2_accounting_read(struct bch_fs *c) keys->gap = keys->nr = dst - keys->data; percpu_down_write(&c->mark_lock); - unsigned i = 0; - while (i < acc->k.nr) { - unsigned idx = inorder_to_eytzinger0(i, acc->k.nr); + darray_for_each_reverse(acc->k, i) { struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos); + bpos_to_disk_accounting_pos(&acc_k, i->pos); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false); + memset(v, 0, sizeof(v)); + + for (unsigned j = 0; j < i->nr_counters; j++) + v[j] = percpu_u64_get(i->v[0] + j); /* * If the entry counters are zeroed, it should be treated as @@ -765,26 +824,25 @@ int bch2_accounting_read(struct bch_fs *c) * Remove it, so that if it's re-added it gets re-marked in the * superblock: */ - ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters) + ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, acc_k, - v, acc->k.data[idx].nr_counters); + : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); if (ret == -BCH_ERR_remove_disk_accounting_entry) { - free_percpu(acc->k.data[idx].v[0]); - free_percpu(acc->k.data[idx].v[1]); - darray_remove_item(&acc->k, &acc->k.data[idx]); - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); + free_percpu(i->v[0]); + free_percpu(i->v[1]); + darray_remove_item(&acc->k, i); ret = 0; continue; } if (ret) goto fsck_err; - i++; } + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + preempt_disable(); struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); @@ -804,7 +862,7 @@ int bch2_accounting_read(struct bch_fs *c) break; case BCH_DISK_ACCOUNTING_dev_data_type: rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); if (ca) { struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; percpu_u64_set(&d->buckets, v[0]); @@ -849,15 +907,13 @@ int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) int bch2_dev_usage_init(struct bch_dev *ca, bool gc) { struct bch_fs *c = ca->fs; - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = BCH_DATA_free, - }; u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; int ret = bch2_trans_do(c, ({ - bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?: + bch2_disk_accounting_mod2(trans, gc, + v, dev_data_type, + .dev = ca->dev_idx, + .data_type = BCH_DATA_free) ?: (!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0); })); bch_err_fn(c, ret); @@ -881,10 +937,15 @@ void bch2_verify_accounting_clean(struct bch_fs *c) bpos_to_disk_accounting_pos(&acc_k, k.k->p); if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - continue; + break; - if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + if (!bch2_accounting_is_mem(acc_k)) { + struct disk_accounting_pos next; + memset(&next, 0, sizeof(next)); + next.type = acc_k.type + 1; + bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); continue; + } bch2_accounting_mem_read(c, k.k->p, v, nr); @@ -910,7 +971,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) break; case BCH_DISK_ACCOUNTING_dev_data_type: { rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (!ca) { rcu_read_unlock(); continue; |