diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2024-07-12 18:24:12 +0300 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2024-07-12 18:24:12 +0300 |
commit | c8b8b8190a80b591aa73c27c70a668799f8db547 (patch) | |
tree | 9d948c9aac89678abe64ac81f6c43348bf4b2091 /fs | |
parent | f0a23883fad4ec8a63faddb9639a92be2e007624 (diff) | |
parent | 492ac37fa38faf520b5beae44c930063265ee183 (diff) | |
download | linux-c8b8b8190a80b591aa73c27c70a668799f8db547.tar.xz |
Merge tag 'loongarch-kvm-6.11' of git://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson into HEAD
LoongArch KVM changes for v6.11
1. Add ParaVirt steal time support.
2. Add some VM migration enhancement.
3. Add perf kvm-stat support for loongarch.
Diffstat (limited to 'fs')
130 files changed, 2038 insertions, 1388 deletions
diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 15bb7989c387..3acf5e050072 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -512,7 +512,7 @@ static int afs_iget5_set_root(struct inode *inode, void *opaque) struct afs_vnode *vnode = AFS_FS_I(inode); vnode->volume = as->volume; - vnode->fid.vid = as->volume->vid, + vnode->fid.vid = as->volume->vid; vnode->fid.vnode = 1; vnode->fid.unique = 1; inode->i_ino = 1; @@ -545,7 +545,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) BUG_ON(!(inode->i_state & I_NEW)); vnode = AFS_FS_I(inode); - vnode->cb_v_check = atomic_read(&as->volume->cb_v_break), + vnode->cb_v_check = atomic_read(&as->volume->cb_v_break); afs_set_netfs_context(vnode); op = afs_alloc_operation(key, as->volume); diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 346cd91f91f9..1de9fac3bcf4 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -29,7 +29,7 @@ #include <linux/sched/task.h> #include <linux/sort.h> -static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket); +static void bch2_discard_one_bucket_fast(struct bch_dev *, u64); /* Persistent alloc info: */ @@ -259,6 +259,14 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, "invalid data type (got %u should be %u)", a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + for (unsigned i = 0; i < 2; i++) + bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX, + c, err, + alloc_key_io_time_bad, + "invalid io_time[%s]: %llu, max %llu", + i == READ ? "read" : "write", + a.v->io_time[i], LRU_TIME_MAX); + switch (a.v->data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: @@ -741,6 +749,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; int ret = 0; struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); @@ -756,8 +765,8 @@ int bch2_trigger_alloc(struct btree_trans *trans, alloc_data_type_set(new_a, new_a->data_type); if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + new_a->io_time[READ] = bch2_current_io_time(c, READ); + new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); } @@ -767,6 +776,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { new_a->gen++; SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + alloc_data_type_set(new_a, new_a->data_type); } if (old_a->data_type != new_a->data_type || @@ -780,7 +790,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (new_a->data_type == BCH_DATA_cached && !new_a->io_time[READ]) - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_a->io_time[READ] = bch2_current_io_time(c, READ); u64 old_lru = alloc_lru_idx_read(*old_a); u64 new_lru = alloc_lru_idx_read(*new_a); @@ -860,8 +870,14 @@ int bch2_trigger_alloc(struct btree_trans *trans, } percpu_down_read(&c->mark_lock); - if (new_a->gen != old_a->gen) - *bucket_gen(ca, new.k->p.offset) = new_a->gen; + if (new_a->gen != old_a->gen) { + u8 *gen = bucket_gen(ca, new.k->p.offset); + if (unlikely(!gen)) { + percpu_up_read(&c->mark_lock); + goto invalid_bucket; + } + *gen = new_a->gen; + } bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); percpu_up_read(&c->mark_lock); @@ -875,14 +891,14 @@ int bch2_trigger_alloc(struct btree_trans *trans, closure_wake_up(&c->freelist_wait); if (statechange(a->data_type == BCH_DATA_need_discard) && - !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && + !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) && bucket_flushed(new_a)) - bch2_discard_one_bucket_fast(c, new.k->p); + bch2_discard_one_bucket_fast(ca, new.k->p.offset); if (statechange(a->data_type == BCH_DATA_cached) && !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) - bch2_do_invalidates(c); + bch2_dev_do_invalidates(ca); if (statechange(a->data_type == BCH_DATA_need_gc_gens)) bch2_gc_gens_async(c); @@ -895,6 +911,11 @@ int bch2_trigger_alloc(struct btree_trans *trans, percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, new.k->p.offset); + if (unlikely(!g)) { + percpu_up_read(&c->mark_lock); + goto invalid_bucket; + } + g->gen_valid = 1; bucket_lock(g); @@ -910,8 +931,14 @@ int bch2_trigger_alloc(struct btree_trans *trans, percpu_up_read(&c->mark_lock); } err: + printbuf_exit(&buf); bch2_dev_put(ca); return ret; +invalid_bucket: + bch2_fs_inconsistent(c, "reference to invalid bucket\n %s", + (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); + ret = -EIO; + goto err; } /* @@ -1561,7 +1588,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (ret) goto err; - a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); ret = bch2_trans_update(trans, alloc_iter, &a_mut->k_i, BTREE_TRIGGER_norun); if (ret) @@ -1609,34 +1636,38 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) return ret; } -static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) +static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) { int ret; - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) - if (bkey_eq(*i, bucket)) { - ret = -EEXIST; + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) + if (i->bucket == bucket) { + ret = -BCH_ERR_EEXIST_discard_in_flight_add; goto out; } - ret = darray_push(&c->discard_buckets_in_flight, bucket); + ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { + .in_progress = in_progress, + .bucket = bucket, + })); out: - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); return ret; } -static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket) +static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) { - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) - if (bkey_eq(*i, bucket)) { - darray_remove_item(&c->discard_buckets_in_flight, i); + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) + if (i->bucket == bucket) { + BUG_ON(!i->in_progress); + darray_remove_item(&ca->discard_buckets_in_flight, i); goto found; } BUG(); found: - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); } struct discard_buckets_state { @@ -1644,26 +1675,11 @@ struct discard_buckets_state { u64 open; u64 need_journal_commit; u64 discarded; - struct bch_dev *ca; u64 need_journal_commit_this_dev; }; -static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca) -{ - if (s->ca == ca) - return; - - if (s->ca && s->need_journal_commit_this_dev > - bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets) - bch2_journal_flush_async(&c->journal, NULL); - - if (s->ca) - percpu_ref_put(&s->ca->io_ref); - s->ca = ca; - s->need_journal_commit_this_dev = 0; -} - static int bch2_discard_one_bucket(struct btree_trans *trans, + struct bch_dev *ca, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, struct discard_buckets_state *s) @@ -1677,16 +1693,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, bool discard_locked = false; int ret = 0; - struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode - ? s->ca - : bch2_dev_get_ioref(c, pos.inode, WRITE); - if (!ca) { - bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); - return 0; - } - - discard_buckets_next_dev(c, s, ca); - if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { s->open++; goto out; @@ -1746,7 +1752,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true))) + if (discard_in_flight_add(ca, iter.pos.offset, true)) goto out; discard_locked = true; @@ -1770,8 +1776,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - alloc_data_type_set(&a->v, a->v.data_type); write: + alloc_data_type_set(&a->v, a->v.data_type); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| @@ -1783,7 +1790,7 @@ write: s->discarded++; out: if (discard_locked) - discard_in_flight_remove(c, iter.pos); + discard_in_flight_remove(ca, iter.pos.offset); s->seen++; bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); @@ -1792,7 +1799,8 @@ out: static void bch2_do_discards_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, discard_work); + struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); + struct bch_fs *c = ca->fs; struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; int ret; @@ -1803,23 +1811,41 @@ static void bch2_do_discards_work(struct work_struct *work) * successful commit: */ ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s))); - - discard_buckets_next_dev(c, &s, NULL); + for_each_btree_key_upto(trans, iter, + BTREE_ID_need_discard, + POS(ca->dev_idx, 0), + POS(ca->dev_idx, U64_MAX), 0, k, + bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s))); trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_write_ref_put(c, BCH_WRITE_REF_discard); + percpu_ref_put(&ca->io_ref); +} + +void bch2_dev_do_discards(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->discard_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_discard); +put_ioref: + percpu_ref_put(&ca->io_ref); } void bch2_do_discards(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && - !queue_work(c->write_ref_wq, &c->discard_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + for_each_member_device(c, ca) + bch2_dev_do_discards(ca); } static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) @@ -1848,68 +1874,69 @@ err: static void bch2_do_discards_fast_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work); + struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); + struct bch_fs *c = ca->fs; while (1) { bool got_bucket = false; - struct bpos bucket; - struct bch_dev *ca; + u64 bucket; - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) { - if (i->snapshot) + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) { + if (i->in_progress) continue; - ca = bch2_dev_get_ioref(c, i->inode, WRITE); - if (!ca) { - darray_remove_item(&c->discard_buckets_in_flight, i); - continue; - } - got_bucket = true; - bucket = *i; - i->snapshot = true; + bucket = i->bucket; + i->in_progress = true; break; } - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); if (!got_bucket) break; if (ca->mi.discard && !c->opts.nochanges) blkdev_issue_discard(ca->disk_sb.bdev, - bucket.offset * ca->mi.bucket_size, + bucket_to_sector(ca, bucket), ca->mi.bucket_size, GFP_KERNEL); int ret = bch2_trans_do(c, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc, - bch2_clear_bucket_needs_discard(trans, bucket)); + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc, + bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); bch_err_fn(c, ret); - percpu_ref_put(&ca->io_ref); - discard_in_flight_remove(c, bucket); + discard_in_flight_remove(ca, bucket); if (ret) break; } bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + percpu_ref_put(&ca->io_ref); } -static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) +static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) { - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode); - bool dead = !ca || percpu_ref_is_dying(&ca->io_ref); - rcu_read_unlock(); + struct bch_fs *c = ca->fs; + + if (discard_in_flight_add(ca, bucket, false)) + return; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; - if (!dead && - !discard_in_flight_add(c, bucket) && - bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && - !queue_work(c->write_ref_wq, &c->discard_fast_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +put_ioref: + percpu_ref_put(&ca->io_ref); } static int invalidate_one_bucket(struct btree_trans *trans, @@ -1957,8 +1984,8 @@ static int invalidate_one_bucket(struct btree_trans *trans, a->v.data_type = 0; a->v.dirty_sectors = 0; a->v.cached_sectors = 0; - a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); - a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); + a->v.io_time[READ] = bch2_current_io_time(c, READ); + a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE); ret = bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| @@ -1993,9 +2020,25 @@ err: goto out; } +static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter, + struct bch_dev *ca, bool *wrapped) +{ + struct bkey_s_c k; +again: + k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); + if (!k.k && !*wrapped) { + bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); + *wrapped = true; + goto again; + } + + return k; +} + static void bch2_do_invalidates_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); + struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work); + struct bch_fs *c = ca->fs; struct btree_trans *trans = bch2_trans_get(c); int ret = 0; @@ -2003,31 +2046,63 @@ static void bch2_do_invalidates_work(struct work_struct *work) if (ret) goto err; - for_each_member_device(c, ca) { - s64 nr_to_invalidate = - should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + s64 nr_to_invalidate = + should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + struct btree_iter iter; + bool wrapped = false; - ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, - lru_pos(ca->dev_idx, 0, 0), - lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), - BTREE_ITER_intent, k, - invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); + bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, + lru_pos(ca->dev_idx, 0, + ((bch2_current_io_time(c, READ) + U32_MAX) & + LRU_TIME_MAX)), 0); - if (ret < 0) { - bch2_dev_put(ca); + while (true) { + bch2_trans_begin(trans); + + struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) break; - } + if (!k.k) + break; + + ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); + if (ret) + break; + + bch2_btree_iter_advance(&iter); } + bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + percpu_ref_put(&ca->io_ref); +} + +void bch2_dev_do_invalidates(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->invalidate_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); +put_ioref: + percpu_ref_put(&ca->io_ref); } void bch2_do_invalidates(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && - !queue_work(c->write_ref_wq, &c->invalidate_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + for_each_member_device(c, ca) + bch2_dev_do_invalidates(ca); } int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, @@ -2186,7 +2261,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, if (ret) return ret; - now = atomic64_read(&c->io_clock[rw].now); + now = bch2_current_io_time(c, rw); if (a->v.io_time[rw] == now) goto out; @@ -2343,16 +2418,20 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } -void bch2_fs_allocator_background_exit(struct bch_fs *c) +void bch2_dev_allocator_background_exit(struct bch_dev *ca) +{ + darray_exit(&ca->discard_buckets_in_flight); +} + +void bch2_dev_allocator_background_init(struct bch_dev *ca) { - darray_exit(&c->discard_buckets_in_flight); + mutex_init(&ca->discard_buckets_in_flight_lock); + INIT_WORK(&ca->discard_work, bch2_do_discards_work); + INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work); + INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work); } void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - mutex_init(&c->discard_buckets_in_flight_lock); - INIT_WORK(&c->discard_work, bch2_do_discards_work); - INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work); - INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index ae31a94be6f9..ba2c5557a3f0 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -141,7 +141,13 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, !bch2_bucket_sectors_fragmented(ca, a)) return 0; - u64 d = bch2_bucket_sectors_dirty(a); + /* + * avoid overflowing LRU_TIME_BITS on a corrupted fs, when + * bucket_sectors_dirty is (much) bigger than bucket_size + */ + u64 d = min(bch2_bucket_sectors_dirty(a), + ca->mi.bucket_size); + return div_u64(d * (1ULL << 31), ca->mi.bucket_size); } @@ -269,6 +275,7 @@ int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, enum btree_iter_update_trigger_flags); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); +void bch2_dev_do_discards(struct bch_dev *); void bch2_do_discards(struct bch_fs *); static inline u64 should_invalidate_buckets(struct bch_dev *ca, @@ -283,6 +290,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); } +void bch2_dev_do_invalidates(struct bch_dev *); void bch2_do_invalidates(struct bch_fs *); static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) @@ -306,7 +314,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); -void bch2_fs_allocator_background_exit(struct bch_fs *); +void bch2_dev_allocator_background_exit(struct bch_dev *); +void bch2_dev_allocator_background_init(struct bch_dev *); + void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 927a5f300b30..9d3d64746a5b 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -621,13 +621,13 @@ again: avail = dev_buckets_free(ca, *usage, watermark); if (usage->d[BCH_DATA_need_discard].buckets > avail) - bch2_do_discards(c); + bch2_dev_do_discards(ca); if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) bch2_gc_gens_async(c); if (should_invalidate_buckets(ca, *usage)) - bch2_do_invalidates(c); + bch2_dev_do_invalidates(ca); if (!avail) { if (cl && !waiting) { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 2a538eb2af11..1106fec6e155 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -493,6 +493,11 @@ struct io_count { u64 sectors[2][BCH_DATA_NR]; }; +struct discard_in_flight { + bool in_progress:1; + u64 bucket:63; +}; + struct bch_dev { struct kobject kobj; #ifdef CONFIG_BCACHEFS_DEBUG @@ -554,6 +559,12 @@ struct bch_dev { size_t inc_gen_really_needs_gc; size_t buckets_waiting_on_journal; + struct work_struct invalidate_work; + struct work_struct discard_work; + struct mutex discard_buckets_in_flight_lock; + DARRAY(struct discard_in_flight) discard_buckets_in_flight; + struct work_struct discard_fast_work; + atomic64_t rebalance_work; struct journal_device journal; @@ -790,7 +801,8 @@ struct bch_fs { /* BTREE CACHE */ struct bio_set btree_bio; - struct workqueue_struct *io_complete_wq; + struct workqueue_struct *btree_read_complete_wq; + struct workqueue_struct *btree_write_submit_wq; struct btree_root btree_roots_known[BTREE_ID_NR]; DARRAY(struct btree_root) btree_roots_extra; @@ -914,11 +926,6 @@ struct bch_fs { unsigned write_points_nr; struct buckets_waiting_for_journal buckets_waiting_for_journal; - struct work_struct invalidate_work; - struct work_struct discard_work; - struct mutex discard_buckets_in_flight_lock; - DARRAY(struct bpos) discard_buckets_in_flight; - struct work_struct discard_fast_work; /* GARBAGE COLLECTION */ struct work_struct gc_gens_work; @@ -1213,6 +1220,11 @@ static inline s64 bch2_current_time(const struct bch_fs *c) return timespec_to_bch2_time(c, now); } +static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw) +{ + return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX); +} + static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) { struct stdio_redirect *stdio = c->stdio; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 90c12fe2a2cd..e3b1bde489c3 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -476,6 +476,9 @@ struct bch_lru { #define LRU_ID_STRIPES (1U << 16) +#define LRU_TIME_BITS 48 +#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -987,8 +990,9 @@ enum bch_version_upgrade_opts { #define BCH_ERROR_ACTIONS() \ x(continue, 0) \ - x(ro, 1) \ - x(panic, 2) + x(fix_safe, 1) \ + x(panic, 2) \ + x(ro, 3) enum bch_error_actions { #define x(t, n) BCH_ON_ERROR_##t = n, @@ -1382,9 +1386,10 @@ enum btree_id { /* * Maximum number of btrees that we will _ever_ have under the current scheme, - * where we refer to them with bitfields + * where we refer to them with 64 bit bitfields - and we also need a bit for + * the interior btree node type: */ -#define BTREE_ID_NR_MAX 64 +#define BTREE_ID_NR_MAX 63 static inline bool btree_id_is_alloc(enum btree_id id) { diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index f46978e5cb7c..94a1d1982fa8 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -1064,7 +1064,7 @@ void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) { const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; u8 *l = k->key_start; - u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; + u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1; while (l < h) { swap(*l, *h); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index c2c3dae52186..bd32aac05192 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -398,8 +398,12 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, for (i = 0; i < nr_compat; i++) switch (!write ? i : nr_compat - 1 - i) { case 0: - if (big_endian != CPU_BIG_ENDIAN) + if (big_endian != CPU_BIG_ENDIAN) { + bch2_bkey_swab_key(f, k); + } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { bch2_bkey_swab_key(f, k); + bch2_bkey_swab_key(f, k); + } break; case 1: if (version < bcachefs_metadata_version_bkey_renumber) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 726ef7483763..baef0722f5fb 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -129,7 +129,8 @@ static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, struct bkey_packed *k) { if (version < bcachefs_metadata_version_current || - big_endian != CPU_BIG_ENDIAN) + big_endian != CPU_BIG_ENDIAN || + IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) __bch2_bkey_compat(level, btree_id, version, big_endian, write, f, k); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 9e4ed75d3675..4f5e411771ba 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -91,10 +91,11 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params bch_btree_cache_params = { - .head_offset = offsetof(struct btree, hash), - .key_offset = offsetof(struct btree, hash_val), - .key_len = sizeof(u64), - .obj_cmpfn = bch2_btree_cache_cmp_fn, + .head_offset = offsetof(struct btree, hash), + .key_offset = offsetof(struct btree, hash_val), + .key_len = sizeof(u64), + .obj_cmpfn = bch2_btree_cache_cmp_fn, + .automatic_shrinking = true, }; static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index dc97991bcd6a..0e477a926579 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -874,6 +874,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans, const struct bch_alloc_v4 *old; int ret; + if (!bucket_valid(ca, k.k->p.offset)) + return 0; + old = bch2_alloc_to_v4(k, &old_convert); gc = new = *old; @@ -990,6 +993,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c) buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = ca->mi.nbuckets; + buckets->nbuckets_minus_first = + buckets->nbuckets - buckets->first_bucket; rcu_assign_pointer(ca->buckets_gc, buckets); } @@ -1003,12 +1008,14 @@ static int bch2_gc_alloc_start(struct bch_fs *c) continue; } - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); + if (bucket_valid(ca, k.k->p.offset)) { + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - struct bucket *g = gc_bucket(ca, k.k->p.offset); - g->gen_valid = 1; - g->gen = a->gen; + struct bucket *g = gc_bucket(ca, k.k->p.offset); + g->gen_valid = 1; + g->gen = a->gen; + } 0; }))); bch2_dev_put(ca); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 829c1b91477d..7bca15c604f5 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1389,7 +1389,7 @@ static void btree_node_read_endio(struct bio *bio) bch2_latency_acct(ca, rb->start_time, READ); } - queue_work(c->io_complete_wq, &rb->work); + queue_work(c->btree_read_complete_wq, &rb->work); } struct btree_node_read_all { @@ -1656,7 +1656,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool btree_node_read_all_replicas_done(&ra->cl.work); } else { continue_at(&ra->cl, btree_node_read_all_replicas_done, - c->io_complete_wq); + c->btree_read_complete_wq); } return 0; @@ -1737,7 +1737,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, if (sync) btree_node_read_work(&rb->work); else - queue_work(c->io_complete_wq, &rb->work); + queue_work(c->btree_read_complete_wq, &rb->work); } } @@ -2229,7 +2229,7 @@ do_write: atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); INIT_WORK(&wbio->work, btree_write_submit); - queue_work(c->io_complete_wq, &wbio->work); + queue_work(c->btree_write_submit_wq, &wbio->work); return; err: set_btree_node_noevict(b); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d3bcb4e4e230..0ed9e6574fcd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -221,11 +221,8 @@ static void bch2_btree_path_verify(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; - unsigned i; - - EBUG_ON(path->btree_id >= BTREE_ID_NR); - for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { + for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { if (!path->l[i].b) { BUG_ON(!path->cached && bch2_btree_id_root(c, path->btree_id)->b->c.level > i); @@ -251,8 +248,6 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; - BUG_ON(iter->btree_id >= BTREE_ID_NR); - BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); BUG_ON((iter->flags & BTREE_ITER_is_extents) && @@ -3135,7 +3130,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); memset(trans, 0, sizeof(*trans)); - closure_init_stack(&trans->ref); seqmutex_lock(&c->btree_trans_lock); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { @@ -3155,15 +3149,10 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) BUG_ON(pos_task && pid == pos_task->pid && pos->locked); - - if (pos_task && pid < pos_task->pid) { - list_add_tail(&trans->list, &pos->list); - goto list_add_done; - } } } - list_add_tail(&trans->list, &c->btree_trans_list); -list_add_done: + + list_add(&trans->list, &c->btree_trans_list); seqmutex_unlock(&c->btree_trans_lock); got_trans: trans->c = c; @@ -3204,6 +3193,8 @@ got_trans: trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; + + closure_init_stack_release(&trans->ref); return trans; } @@ -3240,7 +3231,6 @@ void bch2_trans_put(struct btree_trans *trans) trans_for_each_update(trans, i) __btree_path_put(trans->paths + i->path, true); trans->nr_updates = 0; - trans->locking_wait.task = NULL; check_btree_paths_leaked(trans); @@ -3261,6 +3251,13 @@ void bch2_trans_put(struct btree_trans *trans) if (unlikely(trans->journal_replay_not_finished)) bch2_journal_keys_put(c); + /* + * trans->ref protects trans->locking_wait.task, btree_paths array; used + * by cycle detector + */ + closure_return_sync(&trans->ref); + trans->locking_wait.task = NULL; + unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; trans->paths = NULL; @@ -3278,8 +3275,6 @@ void bch2_trans_put(struct btree_trans *trans) trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); if (trans) { - closure_sync(&trans->ref); - seqmutex_lock(&c->btree_trans_lock); list_del(&trans->list); seqmutex_unlock(&c->btree_trans_lock); @@ -3385,8 +3380,6 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; if (trans) { - closure_sync(&trans->ref); - seqmutex_lock(&c->btree_trans_lock); list_del(&trans->list); seqmutex_unlock(&c->btree_trans_lock); @@ -3406,8 +3399,10 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) bch2_time_stats_exit(&s->lock_hold_times); } - if (c->btree_trans_barrier_initialized) + if (c->btree_trans_barrier_initialized) { + synchronize_srcu_expedited(&c->btree_trans_barrier); cleanup_srcu_struct(&c->btree_trans_barrier); + } mempool_exit(&c->btree_trans_mem_pool); mempool_exit(&c->btree_trans_pool); } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 34056aaece00..2d3c0d45c37f 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -32,10 +32,11 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params bch2_btree_key_cache_params = { - .head_offset = offsetof(struct bkey_cached, hash), - .key_offset = offsetof(struct bkey_cached, key), - .key_len = sizeof(struct bkey_cached_key), - .obj_cmpfn = bch2_btree_key_cache_cmp_fn, + .head_offset = offsetof(struct bkey_cached, hash), + .key_offset = offsetof(struct bkey_cached, key), + .key_len = sizeof(struct bkey_cached_key), + .obj_cmpfn = bch2_btree_key_cache_cmp_fn, + .automatic_shrinking = true, }; __flatten @@ -840,7 +841,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - freed++; bc->nr_freed_nonpcpu--; bc->freed++; } @@ -854,7 +854,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - freed++; bc->nr_freed_pcpu--; bc->freed++; } @@ -876,23 +875,22 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { bc->skipped_dirty++; - goto next; } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); bc->skipped_accessed++; - goto next; - } else if (bkey_cached_lock_for_evict(ck)) { + } else if (!bkey_cached_lock_for_evict(ck)) { + bc->skipped_lock_fail++; + } else { bkey_cached_evict(bc, ck); bkey_cached_free(bc, ck); bc->moved_to_freelist++; - } else { - bc->skipped_lock_fail++; + freed++; } scanned++; if (scanned >= nr) break; -next: + pos = next; } @@ -917,6 +915,14 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, long nr = atomic_long_read(&bc->nr_keys) - atomic_long_read(&bc->nr_dirty); + /* + * Avoid hammering our shrinker too much if it's nearly empty - the + * shrinker code doesn't take into account how big our cache is, if it's + * mostly empty but the system is under memory pressure it causes nasty + * lock contention: + */ + nr -= 128; + return max(0L, nr); } @@ -1025,9 +1031,10 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) if (!shrink) return -BCH_ERR_ENOMEM_fs_btree_cache_init; bc->shrink = shrink; - shrink->seeks = 0; shrink->count_objects = bch2_btree_key_cache_count; shrink->scan_objects = bch2_btree_key_cache_scan; + shrink->batch = 1 << 14; + shrink->seeks = 0; shrink->private_data = c; shrinker_register(shrink); return 0; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index c3e9b0cc7bbd..d66fff22109a 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -215,6 +215,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) if (unlikely(!best)) { struct printbuf buf = PRINTBUF; + buf.atomic++; prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 45cb8149d374..2cb0442f6cc9 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -72,10 +72,11 @@ static bool found_btree_node_is_readable(struct btree_trans *trans, struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false); bool ret = !IS_ERR_OR_NULL(b); - if (ret) { - f->sectors_written = b->written; - six_unlock_read(&b->c.lock); - } + if (!ret) + return ret; + + f->sectors_written = b->written; + six_unlock_read(&b->c.lock); /* * We might update this node's range; if that happens, we need the node diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d63db4fefe73..87f485e9c552 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -761,13 +761,13 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) static inline bool btree_node_type_is_extents(enum btree_node_type type) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1)) BCH_BTREE_IDS() #undef x ; - return (1U << type) & mask; + return BIT_ULL(type) & mask; } static inline bool btree_id_is_extents(enum btree_id btree) @@ -777,35 +777,35 @@ static inline bool btree_id_is_extents(enum btree_id btree) static inline bool btree_type_has_snapshots(enum btree_id id) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(id) & mask; } static inline bool btree_type_has_snapshot_field(enum btree_id id) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(id) & mask; } static inline bool btree_type_has_ptrs(enum btree_id id) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(id) & mask; } struct btree_root { diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ed97712d0db1..743d57eba760 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -465,143 +465,172 @@ int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 return bch2_update_replicas_list(trans, &r.e, sectors); } -int bch2_check_fix_ptrs(struct btree_trans *trans, - enum btree_id btree, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) +static int bch2_check_fix_ptr(struct btree_trans *trans, + struct bkey_s_c k, + struct extent_ptr_decoded p, + const union bch_extent_entry *entry, + bool *do_update) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry_c; - struct extent_ptr_decoded p = { 0 }; - bool do_update = false; struct printbuf buf = PRINTBUF; int ret = 0; - percpu_down_read(&c->mark_lock); + struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); + if (!ca) { + if (fsck_err(c, ptr_to_invalid_device, + "pointer to missing device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + return 0; + } - bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { - struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); - if (!ca) { - if (fsck_err(c, ptr_to_invalid_device, - "pointer to missing device %u\n" - "while marking %s", - p.ptr.dev, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; - continue; - } + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + if (!g) { + if (fsck_err(c, ptr_to_invalid_device, + "pointer to invalid bucket on device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + goto out; + } - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry_c); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); - if (fsck_err_on(!g->gen_valid, - c, ptr_to_missing_alloc_key, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached) { - g->gen_valid = true; - g->gen = p.ptr.gen; - } else { - do_update = true; - } + if (fsck_err_on(!g->gen_valid, + c, ptr_to_missing_alloc_key, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (!p.ptr.cached) { + g->gen_valid = true; + g->gen = p.ptr.gen; + } else { + *do_update = true; } + } - if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, - c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached && - (g->data_type != BCH_DATA_btree || - data_type == BCH_DATA_btree)) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } else { - do_update = true; - } + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (!p.ptr.cached && + (g->data_type != BCH_DATA_btree || + data_type == BCH_DATA_btree)) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } else { + *do_update = true; + } + } + + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + + if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, + c, stale_dirty_ptr, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + + if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) + goto out; + + if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), + c, ptr_bucket_data_type_mismatch, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (data_type == BCH_DATA_btree) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = data_type; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } else { + *do_update = true; } + } + + if (p.has_ec) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); - if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, - c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + if (fsck_err_on(!m || !m->alive, + c, ptr_to_missing_stripe, + "pointer to nonexistent stripe %llu\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, + (u64) p.ec.idx, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; + *do_update = true; - if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, - c, stale_dirty_ptr, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), + c, ptr_to_incorrect_stripe, + "pointer does not match stripe %llu\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, g->gen, + (u64) p.ec.idx, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; + *do_update = true; + } +out: +fsck_err: + bch2_dev_put(ca); + printbuf_exit(&buf); + return ret; +} - if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) - goto next; +int bch2_check_fix_ptrs(struct btree_trans *trans, + enum btree_id btree, unsigned level, struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry_c; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; + struct printbuf buf = PRINTBUF; + int ret = 0; - if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), - c, ptr_bucket_data_type_mismatch, - "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (data_type == BCH_DATA_btree) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = data_type; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } else { - do_update = true; - } - } + percpu_down_read(&c->mark_lock); - if (p.has_ec) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); - - if (fsck_err_on(!m || !m->alive, c, - ptr_to_missing_stripe, - "pointer to nonexistent stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; - - if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, - ptr_to_incorrect_stripe, - "pointer does not match stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - do_update = true; - } -next: - bch2_dev_put(ca); + bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { + ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); + if (ret) + goto err; } if (do_update) { @@ -716,7 +745,6 @@ found: bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); } err: -fsck_err: percpu_up_read(&c->mark_lock); printbuf_exit(&buf); return ret; @@ -987,6 +1015,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { bool insert = !(flags & BTREE_TRIGGER_overwrite); + struct printbuf buf = PRINTBUF; int ret = 0; struct bch_fs *c = trans->c; @@ -1019,6 +1048,13 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (flags & BTREE_TRIGGER_gc) { percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, bucket.offset); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + p.ptr.dev, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + goto err_unlock; + } + bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &new); @@ -1027,10 +1063,12 @@ static int bch2_trigger_pointer(struct btree_trans *trans, bch2_dev_usage_update(c, ca, &old, &new, 0, true); } bucket_unlock(g); +err_unlock: percpu_up_read(&c->mark_lock); } err: bch2_dev_put(ca); + printbuf_exit(&buf); return ret; } @@ -1318,10 +1356,11 @@ static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b, enum bch_data_type data_type, unsigned sectors, enum btree_iter_update_trigger_flags flags) { - int ret = 0; - percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, b); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", + ca->dev_idx, bch2_data_type_str(data_type))) + goto err_unlock; bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g); @@ -1330,29 +1369,27 @@ static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, g->data_type != data_type, c, "different types of data in same bucket: %s, %s", bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type))) { - ret = -EIO; + bch2_data_type_str(data_type))) goto err; - } if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", ca->dev_idx, b, g->gen, bch2_data_type_str(g->data_type ?: data_type), - g->dirty_sectors, sectors)) { - ret = -EIO; + g->dirty_sectors, sectors)) goto err; - } g->data_type = data_type; g->dirty_sectors += sectors; struct bch_alloc_v4 new = bucket_m_to_alloc(*g); + bch2_dev_usage_update(c, ca, &old, &new, 0, true); + percpu_up_read(&c->mark_lock); + return 0; err: bucket_unlock(g); - if (!ret) - bch2_dev_usage_update(c, ca, &old, &new, 0, true); +err_unlock: percpu_up_read(&c->mark_lock); - return ret; + return -EIO; } int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, @@ -1595,6 +1632,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; + bucket_gens->nbuckets_minus_first = + bucket_gens->nbuckets - bucket_gens->first_bucket; if (resize) { down_write(&c->gc_lock); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 617ffde2fb7a..80ee0be9793e 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -93,7 +93,8 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { struct bucket_array *buckets = gc_bucket_array(ca); - BUG_ON(!bucket_valid(ca, b)); + if (b - buckets->first_bucket >= buckets->nbuckets_minus_first) + return NULL; return buckets->b + b; } @@ -110,7 +111,8 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) { struct bucket_gens *gens = bucket_gens(ca); - BUG_ON(!bucket_valid(ca, b)); + if (b - gens->first_bucket >= gens->nbuckets_minus_first) + return NULL; return gens->b + b; } @@ -170,19 +172,22 @@ static inline int gen_after(u8 a, u8 b) return r > 0 ? r : 0; } -static inline u8 dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) +static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - return gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); + u8 *gen = bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)); + if (!gen) + return -1; + return gen_after(*gen, ptr->gen); } /** * dev_ptr_stale() - check if a pointer points into a bucket that has been * invalidated. */ -static inline u8 dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) +static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { rcu_read_lock(); - u8 ret = dev_ptr_stale_rcu(ca, ptr); + int ret = dev_ptr_stale_rcu(ca, ptr); rcu_read_unlock(); return ret; diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 6a31740222a7..f636e17c4caf 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -22,6 +22,7 @@ struct bucket_array { struct rcu_head rcu; u16 first_bucket; size_t nbuckets; + size_t nbuckets_minus_first; struct bucket b[]; }; @@ -29,6 +30,7 @@ struct bucket_gens { struct rcu_head rcu; u16 first_bucket; size_t nbuckets; + size_t nbuckets_minus_first; u8 b[]; }; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 9e54323f0f5f..6d82e1165adc 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -216,7 +216,8 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a ret = PTR_ERR_OR_ZERO(optstr) ?: bch2_parse_mount_opts(NULL, &thr->opts, optstr); - kfree(optstr); + if (!IS_ERR(optstr)) + kfree(optstr); if (ret) goto err; @@ -319,7 +320,8 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) return ret; ret = bch2_dev_add(c, path); - kfree(path); + if (!IS_ERR(path)) + kfree(path); return ret; } @@ -850,7 +852,8 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, ret = PTR_ERR_OR_ZERO(optstr) ?: bch2_parse_mount_opts(c, &thr->opts, optstr); - kfree(optstr); + if (!IS_ERR(optstr)) + kfree(optstr); if (ret) goto err; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0d807c2ce9c6..1a0072eef109 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -202,9 +202,8 @@ restart_drop_conflicting_replicas: bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); /* Now, drop excess replicas: */ -restart_drop_extra_replicas: - rcu_read_lock(); +restart_drop_extra_replicas: bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 51cbf3928361..f0d4727c4dc2 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -568,6 +568,32 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; +typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r); + +static void list_sort(struct list_head *head, list_cmp_fn cmp) +{ + struct list_head *pos; + + list_for_each(pos, head) + while (!list_is_last(pos, head) && + cmp(pos, pos->next) > 0) { + struct list_head *pos2, *next = pos->next; + + list_del(next); + list_for_each(pos2, head) + if (cmp(next, pos2) < 0) + goto pos_found; + BUG(); +pos_found: + list_add_tail(next, pos2); + } +} + +static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r) +{ + return cmp_int(l, r); +} + static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -575,41 +601,39 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, struct bch_fs *c = i->c; struct btree_trans *trans; ssize_t ret = 0; - u32 seq; i->ubuf = buf; i->size = size; i->ret = 0; restart: seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct task_struct *task = READ_ONCE(trans->locking_wait.task); + list_sort(&c->btree_trans_list, list_ptr_order_cmp); - if (!task || task->pid <= i->iter) + list_for_each_entry(trans, &c->btree_trans_list, list) { + if ((ulong) trans < i->iter) continue; - closure_get(&trans->ref); - seq = seqmutex_seq(&c->btree_trans_lock); - seqmutex_unlock(&c->btree_trans_lock); + i->iter = (ulong) trans; - ret = flush_buf(i); - if (ret) { - closure_put(&trans->ref); - goto unlocked; - } + if (!closure_get_not_zero(&trans->ref)) + continue; + + u32 seq = seqmutex_unlock(&c->btree_trans_lock); bch2_btree_trans_to_text(&i->buf, trans); prt_printf(&i->buf, "backtrace:\n"); printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL); + bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); - i->iter = task->pid; - closure_put(&trans->ref); + ret = flush_buf(i); + if (ret) + goto unlocked; + if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } @@ -804,50 +828,55 @@ static const struct file_operations btree_transaction_stats_op = { .read = btree_transaction_stats_read, }; -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +/* walk btree transactions until we find a deadlock and print it */ +static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) { - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; struct btree_trans *trans; - ssize_t ret = 0; - u32 seq; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - if (i->iter) - goto out; + pid_t iter = 0; restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { struct task_struct *task = READ_ONCE(trans->locking_wait.task); - if (!task || task->pid <= i->iter) + if (!task || task->pid <= iter) continue; - closure_get(&trans->ref); - seq = seqmutex_seq(&c->btree_trans_lock); - seqmutex_unlock(&c->btree_trans_lock); + iter = task->pid; - ret = flush_buf(i); - if (ret) { - closure_put(&trans->ref); - goto out; - } + if (!closure_get_not_zero(&trans->ref)) + continue; - bch2_check_for_deadlock(trans, &i->buf); + u32 seq = seqmutex_unlock(&c->btree_trans_lock); - i->iter = task->pid; + bool found = bch2_check_for_deadlock(trans, out) != 0; closure_put(&trans->ref); + if (found) + return; + if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } seqmutex_unlock(&c->btree_trans_lock); -out: +} + +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (!i->iter) { + btree_deadlock_to_text(&i->buf, c); + i->iter++; + } + if (i->buf.allocation_failure) ret = -ENOMEM; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d8b9beca3776..83e279d41829 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -268,6 +268,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, { struct bch_fs *c = trans->c; const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; + struct printbuf buf = PRINTBUF; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); @@ -289,6 +290,13 @@ static int mark_stripe_bucket(struct btree_trans *trans, if (flags & BTREE_TRIGGER_gc) { percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, bucket.offset); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + ptr->dev, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -EIO; + goto err_unlock; + } + bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); @@ -297,10 +305,12 @@ static int mark_stripe_bucket(struct btree_trans *trans, bch2_dev_usage_update(c, ca, &old, &new, 0, true); } bucket_unlock(g); +err_unlock: percpu_up_read(&c->mark_lock); } err: bch2_dev_put(ca); + printbuf_exit(&buf); return ret; } @@ -714,10 +724,12 @@ static void ec_block_endio(struct bio *bio) bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); - if (dev_ptr_stale(ca, ptr)) { + int stale = dev_ptr_stale(ca, ptr); + if (stale) { bch_err_ratelimited(ca->fs, - "error %s stripe: stale pointer after io", - bio_data_dir(bio) == READ ? "reading from" : "writing to"); + "error %s stripe: stale/invalid pointer (%i) after io", + bio_data_dir(bio) == READ ? "reading from" : "writing to", + stale); clear_bit(ec_bio->idx, ec_bio->buf->valid); } @@ -743,10 +755,12 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, return; } - if (dev_ptr_stale(ca, ptr)) { + int stale = dev_ptr_stale(ca, ptr); + if (stale) { bch_err_ratelimited(c, - "error %s stripe: stale pointer", - rw == READ ? "reading from" : "writing to"); + "error %s stripe: stale pointer (%i)", + rw == READ ? "reading from" : "writing to", + stale); clear_bit(idx, buf->valid); return; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index dbe35b80bc0b..58612abf7927 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -116,6 +116,9 @@ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ + x(EEXIST, EEXIST_str_hash_set) \ + x(EEXIST, EEXIST_discard_in_flight_add) \ + x(EEXIST, EEXIST_subvolume_create) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index c66eeffcd7f2..d95c40f1b6af 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -15,6 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) switch (c->opts.errors) { case BCH_ON_ERROR_continue: return false; + case BCH_ON_ERROR_fix_safe: case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", @@ -191,6 +192,12 @@ static void prt_actioning(struct printbuf *out, const char *action) prt_str(out, "ing"); } +static const u8 fsck_flags_extra[] = { +#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags, + BCH_SB_ERRS() +#undef x +}; + int bch2_fsck_err(struct bch_fs *c, enum bch_fsck_flags flags, enum bch_sb_error_id err, @@ -203,6 +210,9 @@ int bch2_fsck_err(struct bch_fs *c, int ret = -BCH_ERR_fsck_ignore; const char *action_orig = "fix?", *action = action_orig; + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) + flags |= fsck_flags_extra[err]; + if ((flags & FSCK_CAN_FIX) && test_bit(err, c->sb.errors_silent)) return -BCH_ERR_fsck_fix; @@ -265,7 +275,14 @@ int bch2_fsck_err(struct bch_fs *c, prt_printf(out, bch2_log_msg(c, "")); #endif - if (!test_bit(BCH_FS_fsck_running, &c->flags)) { + if ((flags & FSCK_CAN_FIX) && + (flags & FSCK_AUTOFIX) && + (c->opts.errors == BCH_ON_ERROR_continue || + c->opts.errors == BCH_ON_ERROR_fix_safe)) { + prt_str(out, ", "); + prt_actioning(out, action); + ret = -BCH_ERR_fsck_fix; + } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str(out, ", shutting down"); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 36caedf72d89..777711504c35 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -108,13 +108,6 @@ struct fsck_err_state { char *last_msg; }; -enum bch_fsck_flags { - FSCK_CAN_FIX = 1 << 0, - FSCK_CAN_IGNORE = 1 << 1, - FSCK_NEED_FSCK = 1 << 2, - FSCK_NO_RATELIMIT = 1 << 3, -}; - #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) __printf(4, 5) __cold diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 469037929685..410b8bd81b5a 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -137,7 +137,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); - if (p.ptr.cached && (!ca || dev_ptr_stale(ca, &p.ptr))) + if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; @@ -999,7 +999,7 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) bch2_bkey_drop_ptrs(k, ptr, ptr->cached && (ca = bch2_dev_rcu(c, ptr->dev)) && - dev_ptr_stale_rcu(ca, ptr)); + dev_ptr_stale_rcu(ca, ptr) > 0); rcu_read_unlock(); return bkey_deleted(k.k); @@ -1024,8 +1024,11 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc prt_str(out, " cached"); if (ptr->unwritten) prt_str(out, " unwritten"); - if (bucket_valid(ca, b) && dev_ptr_stale_rcu(ca, ptr)) + int stale = dev_ptr_stale_rcu(ca, ptr); + if (stale > 0) prt_printf(out, " stale"); + else if (stale) + prt_printf(out, " invalid"); } rcu_read_unlock(); --out->atomic; diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 205a323ffc6d..79a0c8732bce 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -308,8 +308,8 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) return ret; } -static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) +static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) { struct inode *dir; struct bch_inode_info *inode; @@ -373,7 +373,7 @@ retry: } if (dst_dentry->d_inode) { - error = -EEXIST; + error = -BCH_ERR_EEXIST_subvolume_create; goto err3; } @@ -406,9 +406,12 @@ retry: !arg.src_ptr) snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol; + down_write(&c->snapshot_create_lock); inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), dst_dentry, arg.mode|S_IFDIR, 0, snapshot_src, create_flags); + up_write(&c->snapshot_create_lock); + error = PTR_ERR_OR_ZERO(inode); if (error) goto err3; @@ -429,16 +432,6 @@ err1: return error; } -static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) -{ - down_write(&c->snapshot_create_lock); - long ret = __bch2_ioctl_subvolume_create(c, filp, arg); - up_write(&c->snapshot_create_lock); - - return ret; -} - static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, struct bch_ioctl_subvolume arg) { diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index cd388f1702dc..f9c9a95d7d4c 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -188,6 +188,12 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino BUG_ON(!old); if (unlikely(old != inode)) { + /* + * bcachefs doesn't use I_NEW; we have no use for it since we + * only insert fully created inodes in the inode hash table. But + * discard_new_inode() expects it to be set... + */ + inode->v.i_flags |= I_NEW; discard_new_inode(&inode->v); inode = old; } else { @@ -195,8 +201,10 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); mutex_unlock(&c->vfs_inodes_lock); /* - * we really don't want insert_inode_locked2() to be setting - * I_NEW... + * Again, I_NEW makes no sense for bcachefs. This is only needed + * for clearing I_NEW, but since the inode was already fully + * created and initialized we didn't actually want + * inode_insert5() to set it for us. */ unlock_new_inode(&inode->v); } @@ -227,7 +235,9 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) mutex_init(&inode->ei_update_lock); two_state_lock_init(&inode->ei_pagecache_lock); INIT_LIST_HEAD(&inode->ei_vfs_inode_list); + inode->ei_flags = 0; mutex_init(&inode->ei_quota_lock); + memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); inode->v.i_state = 0; if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) { @@ -1155,6 +1165,7 @@ static const struct file_operations bch_file_operations = { .read_iter = bch2_read_iter, .write_iter = bch2_write_iter, .mmap = bch2_mmap, + .get_unmapped_area = thp_get_unmapped_area, .fsync = bch2_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, @@ -1486,11 +1497,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, bch2_iget5_set(&inode->v, &inum); bch2_inode_update_after_write(trans, inode, bi, ~0); - if (BCH_SUBVOLUME_SNAP(subvol)) - set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - else - clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; inode->v.i_rdev = bi->bi_dev; @@ -1502,6 +1508,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, inode->ei_qid = bch_qid(bi); inode->ei_subvol = inum.subvol; + if (BCH_SUBVOLUME_SNAP(subvol)) + set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + inode->v.i_mapping->a_ops = &bch_address_space_operations; switch (inode->v.i_mode & S_IFMT) { @@ -1967,6 +1976,7 @@ got_sb: sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); sb->s_uuid = c->sb.user_uuid; + sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index fd277bd58ed3..921bcdb3e5e4 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1677,6 +1677,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) trans_was_restarted(trans, restart_count); } +noinline_for_stack static int check_dirent_inode_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d, @@ -1773,6 +1774,7 @@ out_noiter: return ret; } +noinline_for_stack static int check_dirent_target(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d, @@ -1847,6 +1849,7 @@ found: return ret; } +noinline_for_stack static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d) { diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index f57486794484..c97fa7002b06 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -84,9 +84,10 @@ struct promote_op { }; static const struct rhashtable_params bch_promote_params = { - .head_offset = offsetof(struct promote_op, hash), - .key_offset = offsetof(struct promote_op, pos), - .key_len = sizeof(struct bpos), + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), + .automatic_shrinking = true, }; static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, @@ -776,18 +777,32 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, PTR_BUCKET_POS(ca, &ptr), BTREE_ITER_cached); - prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); - printbuf_indent_add(&buf, 2); + u8 *gen = bucket_gen(ca, iter.pos.offset); + if (gen) { - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); + prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); + printbuf_indent_add(&buf, 2); - prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); - - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (!ret) { + bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); + + prt_printf(&buf, "memory gen: %u", *gen); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (!ret) { + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + } + } else { + prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n", + iter.pos.inode, iter.pos.offset); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "first bucket %u nbuckets %llu\n", + ca->mi.first_bucket, ca->mi.nbuckets); + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); } bch2_fs_inconsistent(c, "%s", buf.buf); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 9401d13e31bb..05e0cbef420b 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1220,7 +1220,7 @@ static void bch2_nocow_write(struct bch_write_op *op) DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; u32 snapshot; struct bucket_to_lock *stale_at; - int ret; + int stale, ret; if (op->flags & BCH_WRITE_MOVE) return; @@ -1299,7 +1299,8 @@ retry: BUCKET_NOCOW_LOCK_UPDATE); rcu_read_lock(); - bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen); + u8 *gen = bucket_gen(ca, i->b.offset); + stale = !gen ? -1 : gen_after(*gen, i->gen); rcu_read_unlock(); if (unlikely(stale)) { @@ -1380,8 +1381,18 @@ err_bucket_stale: break; } - /* We can retry this: */ - ret = -BCH_ERR_transaction_restart; + struct printbuf buf = PRINTBUF; + if (bch2_fs_inconsistent_on(stale < 0, c, + "pointer to invalid bucket in nocow path on device %llu\n %s", + stale_at->b.inode, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + } else { + /* We can retry this: */ + ret = -BCH_ERR_transaction_restart; + } + printbuf_exit(&buf); + goto err_get_ioref; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index adec8e1ea73e..13669dd0e375 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1167,6 +1167,9 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) void bch2_fs_journal_stop(struct journal *j) { + if (!test_bit(JOURNAL_running, &j->flags)) + return; + bch2_journal_reclaim_stop(j); bch2_journal_flush_all_pins(j); @@ -1518,6 +1521,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 struct journal_entry_pin *pin; spin_lock(&j->lock); + if (!test_bit(JOURNAL_running, &j->flags)) { + spin_unlock(&j->lock); + return true; + } + *seq = max(*seq, j->pin.front); if (*seq >= j->pin.back) { diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index cdcb1ad49af4..db24ce21b2ac 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1677,6 +1677,13 @@ static CLOSURE_CALLBACK(journal_write_done) mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); } + /* + * We don't typically trigger journal writes from her - the next journal + * write will be triggered immediately after the previous one is + * allocated, in bch2_journal_write() - but the journal write error path + * is special: + */ + bch2_journal_do_writes(j); spin_unlock(&j->lock); } @@ -1967,7 +1974,6 @@ CLOSURE_CALLBACK(bch2_journal_write) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; - struct printbuf journal_debug_buf = PRINTBUF; unsigned nr_rw_members = 0; int ret; @@ -2011,11 +2017,15 @@ CLOSURE_CALLBACK(bch2_journal_write) } if (ret) { - __bch2_journal_debug_to_text(&journal_debug_buf, j); + struct printbuf buf = PRINTBUF; + buf.atomic++; + + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write: %s"), + bch2_err_str(ret)); + __bch2_journal_debug_to_text(&buf, j); spin_unlock(&j->lock); - bch_err(c, "Unable to allocate journal write:\n%s", - journal_debug_buf.buf); - printbuf_exit(&journal_debug_buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); goto err; } diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index ed4846709611..1f25c111c54c 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -232,7 +232,7 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c) BUG_ON(nr != t->nr); unsigned i; - for (src = bl->start, i = eytzinger0_first(t->nr); + for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr); src < bl->start + nr; src++, i = eytzinger0_next(i, nr)) { BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index fb11ab0dd00e..bd71ba77de07 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -2,9 +2,6 @@ #ifndef _BCACHEFS_LRU_H #define _BCACHEFS_LRU_H -#define LRU_TIME_BITS 48 -#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) - static inline u64 lru_pos_id(struct bpos pos) { return pos.inode >> LRU_TIME_BITS; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8171f947fac8..6e477fadaa2a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -547,6 +547,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, ctxt->stats->pos = BBPOS(btree_id, start); } + bch2_trans_begin(trans); bch2_trans_iter_init(trans, &iter, btree_id, start, BTREE_ITER_prefetch| BTREE_ITER_all_snapshots); @@ -920,7 +921,20 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, ? c->opts.metadata_replicas : io_opts->data_replicas; - if (!nr_good || nr_good >= replicas) + rcu_read_lock(); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned i = 0; + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ptr->cached && + (!ca || !ca->mi.durability)) + data_opts->kill_ptrs |= BIT(i); + i++; + } + rcu_read_unlock(); + + if (!data_opts->kill_ptrs && + (!nr_good || nr_good >= replicas)) return false; data_opts->target = 0; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 10bfb31c151b..eb49dd045eff 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -35,9 +35,10 @@ struct buckets_in_flight { }; static const struct rhashtable_params bch_move_bucket_params = { - .head_offset = offsetof(struct move_bucket_in_flight, hash), - .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), - .key_len = sizeof(struct move_bucket_key), + .head_offset = offsetof(struct move_bucket_in_flight, hash), + .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), + .key_len = sizeof(struct move_bucket_key), + .automatic_shrinking = true, }; static struct move_bucket_in_flight * diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 25530e0bb2f3..b197ec90d4cb 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -137,7 +137,7 @@ enum fsck_err_opts { x(errors, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_error_actions), \ - BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ NULL, "Action to take on filesystem error") \ x(metadata_replicas, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index cf513fc79ce4..1f9d044ed920 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -326,6 +326,12 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_btree_root: { struct btree_root *r; + if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, + c, invalid_btree_id, + "invalid btree id %u (max %u)", + entry->btree_id, BTREE_ID_NR_MAX)) + return 0; + while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); if (ret) @@ -415,7 +421,7 @@ static int journal_replay_entry_early(struct bch_fs *c, atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); } } - +fsck_err: return ret; } @@ -658,10 +664,10 @@ int bch2_fs_recovery(struct bch_fs *c) if (check_version_upgrade(c)) write_sb = true; + c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if (write_sb) bch2_write_super(c); - - c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); mutex_unlock(&c->sb_lock); if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 3fb23e399ffb..4710b61631f0 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -228,7 +228,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c) dst = (void *) &darray_top(table); dst->version = cpu_to_le16(src->version); - dst->recovery_passes[0] = cpu_to_le64(src->recovery_passes); + dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes)); dst->recovery_passes[1] = 0; dst->nr_errors = cpu_to_le16(src->nr_errors); for (unsigned i = 0; i < src->nr_errors; i++) diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c index bda33e59e226..c1270d790e43 100644 --- a/fs/bcachefs/sb-errors.c +++ b/fs/bcachefs/sb-errors.c @@ -110,19 +110,25 @@ out: void bch2_sb_errors_from_cpu(struct bch_fs *c) { bch_sb_errors_cpu *src = &c->fsck_error_counts; - struct bch_sb_field_errors *dst = - bch2_sb_field_resize(&c->disk_sb, errors, - bch2_sb_field_errors_u64s(src->nr)); + struct bch_sb_field_errors *dst; unsigned i; + mutex_lock(&c->fsck_error_counts_lock); + + dst = bch2_sb_field_resize(&c->disk_sb, errors, + bch2_sb_field_errors_u64s(src->nr)); + if (!dst) - return; + goto err; for (i = 0; i < src->nr; i++) { SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); } + +err: + mutex_unlock(&c->fsck_error_counts_lock); } static int bch2_sb_errors_to_cpu(struct bch_fs *c) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 84d2763bd597..d6f35a99c429 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -2,281 +2,294 @@ #ifndef _BCACHEFS_SB_ERRORS_FORMAT_H #define _BCACHEFS_SB_ERRORS_FORMAT_H -#define BCH_SB_ERRS() \ - x(clean_but_journal_not_empty, 0) \ - x(dirty_but_no_journal_entries, 1) \ - x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ - x(sb_clean_journal_seq_mismatch, 3) \ - x(sb_clean_btree_root_mismatch, 4) \ - x(sb_clean_missing, 5) \ - x(jset_unsupported_version, 6) \ - x(jset_unknown_csum, 7) \ - x(jset_last_seq_newer_than_seq, 8) \ - x(jset_past_bucket_end, 9) \ - x(jset_seq_blacklisted, 10) \ - x(journal_entries_missing, 11) \ - x(journal_entry_replicas_not_marked, 12) \ - x(journal_entry_past_jset_end, 13) \ - x(journal_entry_replicas_data_mismatch, 14) \ - x(journal_entry_bkey_u64s_0, 15) \ - x(journal_entry_bkey_past_end, 16) \ - x(journal_entry_bkey_bad_format, 17) \ - x(journal_entry_bkey_invalid, 18) \ - x(journal_entry_btree_root_bad_size, 19) \ - x(journal_entry_blacklist_bad_size, 20) \ - x(journal_entry_blacklist_v2_bad_size, 21) \ - x(journal_entry_blacklist_v2_start_past_end, 22) \ - x(journal_entry_usage_bad_size, 23) \ - x(journal_entry_data_usage_bad_size, 24) \ - x(journal_entry_clock_bad_size, 25) \ - x(journal_entry_clock_bad_rw, 26) \ - x(journal_entry_dev_usage_bad_size, 27) \ - x(journal_entry_dev_usage_bad_dev, 28) \ - x(journal_entry_dev_usage_bad_pad, 29) \ - x(btree_node_unreadable, 30) \ - x(btree_node_fault_injected, 31) \ - x(btree_node_bad_magic, 32) \ - x(btree_node_bad_seq, 33) \ - x(btree_node_unsupported_version, 34) \ - x(btree_node_bset_older_than_sb_min, 35) \ - x(btree_node_bset_newer_than_sb, 36) \ - x(btree_node_data_missing, 37) \ - x(btree_node_bset_after_end, 38) \ - x(btree_node_replicas_sectors_written_mismatch, 39) \ - x(btree_node_replicas_data_mismatch, 40) \ - x(bset_unknown_csum, 41) \ - x(bset_bad_csum, 42) \ - x(bset_past_end_of_btree_node, 43) \ - x(bset_wrong_sector_offset, 44) \ - x(bset_empty, 45) \ - x(bset_bad_seq, 46) \ - x(bset_blacklisted_journal_seq, 47) \ - x(first_bset_blacklisted_journal_seq, 48) \ - x(btree_node_bad_btree, 49) \ - x(btree_node_bad_level, 50) \ - x(btree_node_bad_min_key, 51) \ - x(btree_node_bad_max_key, 52) \ - x(btree_node_bad_format, 53) \ - x(btree_node_bkey_past_bset_end, 54) \ - x(btree_node_bkey_bad_format, 55) \ - x(btree_node_bad_bkey, 56) \ - x(btree_node_bkey_out_of_order, 57) \ - x(btree_root_bkey_invalid, 58) \ - x(btree_root_read_error, 59) \ - x(btree_root_bad_min_key, 60) \ - x(btree_root_bad_max_key, 61) \ - x(btree_node_read_error, 62) \ - x(btree_node_topology_bad_min_key, 63) \ - x(btree_node_topology_bad_max_key, 64) \ - x(btree_node_topology_overwritten_by_prev_node, 65) \ - x(btree_node_topology_overwritten_by_next_node, 66) \ - x(btree_node_topology_interior_node_empty, 67) \ - x(fs_usage_hidden_wrong, 68) \ - x(fs_usage_btree_wrong, 69) \ - x(fs_usage_data_wrong, 70) \ - x(fs_usage_cached_wrong, 71) \ - x(fs_usage_reserved_wrong, 72) \ - x(fs_usage_persistent_reserved_wrong, 73) \ - x(fs_usage_nr_inodes_wrong, 74) \ - x(fs_usage_replicas_wrong, 75) \ - x(dev_usage_buckets_wrong, 76) \ - x(dev_usage_sectors_wrong, 77) \ - x(dev_usage_fragmented_wrong, 78) \ - x(dev_usage_buckets_ec_wrong, 79) \ - x(bkey_version_in_future, 80) \ - x(bkey_u64s_too_small, 81) \ - x(bkey_invalid_type_for_btree, 82) \ - x(bkey_extent_size_zero, 83) \ - x(bkey_extent_size_greater_than_offset, 84) \ - x(bkey_size_nonzero, 85) \ - x(bkey_snapshot_nonzero, 86) \ - x(bkey_snapshot_zero, 87) \ - x(bkey_at_pos_max, 88) \ - x(bkey_before_start_of_btree_node, 89) \ - x(bkey_after_end_of_btree_node, 90) \ - x(bkey_val_size_nonzero, 91) \ - x(bkey_val_size_too_small, 92) \ - x(alloc_v1_val_size_bad, 93) \ - x(alloc_v2_unpack_error, 94) \ - x(alloc_v3_unpack_error, 95) \ - x(alloc_v4_val_size_bad, 96) \ - x(alloc_v4_backpointers_start_bad, 97) \ - x(alloc_key_data_type_bad, 98) \ - x(alloc_key_empty_but_have_data, 99) \ - x(alloc_key_dirty_sectors_0, 100) \ - x(alloc_key_data_type_inconsistency, 101) \ - x(alloc_key_to_missing_dev_bucket, 102) \ - x(alloc_key_cached_inconsistency, 103) \ - x(alloc_key_cached_but_read_time_zero, 104) \ - x(alloc_key_to_missing_lru_entry, 105) \ - x(alloc_key_data_type_wrong, 106) \ - x(alloc_key_gen_wrong, 107) \ - x(alloc_key_dirty_sectors_wrong, 108) \ - x(alloc_key_cached_sectors_wrong, 109) \ - x(alloc_key_stripe_wrong, 110) \ - x(alloc_key_stripe_redundancy_wrong, 111) \ - x(bucket_sector_count_overflow, 112) \ - x(bucket_metadata_type_mismatch, 113) \ - x(need_discard_key_wrong, 114) \ - x(freespace_key_wrong, 115) \ - x(freespace_hole_missing, 116) \ - x(bucket_gens_val_size_bad, 117) \ - x(bucket_gens_key_wrong, 118) \ - x(bucket_gens_hole_wrong, 119) \ - x(bucket_gens_to_invalid_dev, 120) \ - x(bucket_gens_to_invalid_buckets, 121) \ - x(bucket_gens_nonzero_for_invalid_buckets, 122) \ - x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ - x(need_discard_freespace_key_bad, 124) \ - x(backpointer_bucket_offset_wrong, 125) \ - x(backpointer_to_missing_device, 126) \ - x(backpointer_to_missing_alloc, 127) \ - x(backpointer_to_missing_ptr, 128) \ - x(lru_entry_at_time_0, 129) \ - x(lru_entry_to_invalid_bucket, 130) \ - x(lru_entry_bad, 131) \ - x(btree_ptr_val_too_big, 132) \ - x(btree_ptr_v2_val_too_big, 133) \ - x(btree_ptr_has_non_ptr, 134) \ - x(extent_ptrs_invalid_entry, 135) \ - x(extent_ptrs_no_ptrs, 136) \ - x(extent_ptrs_too_many_ptrs, 137) \ - x(extent_ptrs_redundant_crc, 138) \ - x(extent_ptrs_redundant_stripe, 139) \ - x(extent_ptrs_unwritten, 140) \ - x(extent_ptrs_written_and_unwritten, 141) \ - x(ptr_to_invalid_device, 142) \ - x(ptr_to_duplicate_device, 143) \ - x(ptr_after_last_bucket, 144) \ - x(ptr_before_first_bucket, 145) \ - x(ptr_spans_multiple_buckets, 146) \ - x(ptr_to_missing_backpointer, 147) \ - x(ptr_to_missing_alloc_key, 148) \ - x(ptr_to_missing_replicas_entry, 149) \ - x(ptr_to_missing_stripe, 150) \ - x(ptr_to_incorrect_stripe, 151) \ - x(ptr_gen_newer_than_bucket_gen, 152) \ - x(ptr_too_stale, 153) \ - x(stale_dirty_ptr, 154) \ - x(ptr_bucket_data_type_mismatch, 155) \ - x(ptr_cached_and_erasure_coded, 156) \ - x(ptr_crc_uncompressed_size_too_small, 157) \ - x(ptr_crc_csum_type_unknown, 158) \ - x(ptr_crc_compression_type_unknown, 159) \ - x(ptr_crc_redundant, 160) \ - x(ptr_crc_uncompressed_size_too_big, 161) \ - x(ptr_crc_nonce_mismatch, 162) \ - x(ptr_stripe_redundant, 163) \ - x(reservation_key_nr_replicas_invalid, 164) \ - x(reflink_v_refcount_wrong, 165) \ - x(reflink_p_to_missing_reflink_v, 166) \ - x(stripe_pos_bad, 167) \ - x(stripe_val_size_bad, 168) \ - x(stripe_sector_count_wrong, 169) \ - x(snapshot_tree_pos_bad, 170) \ - x(snapshot_tree_to_missing_snapshot, 171) \ - x(snapshot_tree_to_missing_subvol, 172) \ - x(snapshot_tree_to_wrong_subvol, 173) \ - x(snapshot_tree_to_snapshot_subvol, 174) \ - x(snapshot_pos_bad, 175) \ - x(snapshot_parent_bad, 176) \ - x(snapshot_children_not_normalized, 177) \ - x(snapshot_child_duplicate, 178) \ - x(snapshot_child_bad, 179) \ - x(snapshot_skiplist_not_normalized, 180) \ - x(snapshot_skiplist_bad, 181) \ - x(snapshot_should_not_have_subvol, 182) \ - x(snapshot_to_bad_snapshot_tree, 183) \ - x(snapshot_bad_depth, 184) \ - x(snapshot_bad_skiplist, 185) \ - x(subvol_pos_bad, 186) \ - x(subvol_not_master_and_not_snapshot, 187) \ - x(subvol_to_missing_root, 188) \ - x(subvol_root_wrong_bi_subvol, 189) \ - x(bkey_in_missing_snapshot, 190) \ - x(inode_pos_inode_nonzero, 191) \ - x(inode_pos_blockdev_range, 192) \ - x(inode_unpack_error, 193) \ - x(inode_str_hash_invalid, 194) \ - x(inode_v3_fields_start_bad, 195) \ - x(inode_snapshot_mismatch, 196) \ - x(inode_unlinked_but_clean, 197) \ - x(inode_unlinked_but_nlink_nonzero, 198) \ - x(inode_checksum_type_invalid, 199) \ - x(inode_compression_type_invalid, 200) \ - x(inode_subvol_root_but_not_dir, 201) \ - x(inode_i_size_dirty_but_clean, 202) \ - x(inode_i_sectors_dirty_but_clean, 203) \ - x(inode_i_sectors_wrong, 204) \ - x(inode_dir_wrong_nlink, 205) \ - x(inode_dir_multiple_links, 206) \ - x(inode_multiple_links_but_nlink_0, 207) \ - x(inode_wrong_backpointer, 208) \ - x(inode_wrong_nlink, 209) \ - x(inode_unreachable, 210) \ - x(deleted_inode_but_clean, 211) \ - x(deleted_inode_missing, 212) \ - x(deleted_inode_is_dir, 213) \ - x(deleted_inode_not_unlinked, 214) \ - x(extent_overlapping, 215) \ - x(extent_in_missing_inode, 216) \ - x(extent_in_non_reg_inode, 217) \ - x(extent_past_end_of_inode, 218) \ - x(dirent_empty_name, 219) \ - x(dirent_val_too_big, 220) \ - x(dirent_name_too_long, 221) \ - x(dirent_name_embedded_nul, 222) \ - x(dirent_name_dot_or_dotdot, 223) \ - x(dirent_name_has_slash, 224) \ - x(dirent_d_type_wrong, 225) \ - x(inode_bi_parent_wrong, 226) \ - x(dirent_in_missing_dir_inode, 227) \ - x(dirent_in_non_dir_inode, 228) \ - x(dirent_to_missing_inode, 229) \ - x(dirent_to_missing_subvol, 230) \ - x(dirent_to_itself, 231) \ - x(quota_type_invalid, 232) \ - x(xattr_val_size_too_small, 233) \ - x(xattr_val_size_too_big, 234) \ - x(xattr_invalid_type, 235) \ - x(xattr_name_invalid_chars, 236) \ - x(xattr_in_missing_inode, 237) \ - x(root_subvol_missing, 238) \ - x(root_dir_missing, 239) \ - x(root_inode_not_dir, 240) \ - x(dir_loop, 241) \ - x(hash_table_key_duplicate, 242) \ - x(hash_table_key_wrong_offset, 243) \ - x(unlinked_inode_not_on_deleted_list, 244) \ - x(reflink_p_front_pad_bad, 245) \ - x(journal_entry_dup_same_device, 246) \ - x(inode_bi_subvol_missing, 247) \ - x(inode_bi_subvol_wrong, 248) \ - x(inode_points_to_missing_dirent, 249) \ - x(inode_points_to_wrong_dirent, 250) \ - x(inode_bi_parent_nonzero, 251) \ - x(dirent_to_missing_parent_subvol, 252) \ - x(dirent_not_visible_in_parent_subvol, 253) \ - x(subvol_fs_path_parent_wrong, 254) \ - x(subvol_root_fs_path_parent_nonzero, 255) \ - x(subvol_children_not_set, 256) \ - x(subvol_children_bad, 257) \ - x(subvol_loop, 258) \ - x(subvol_unreachable, 259) \ - x(btree_node_bkey_bad_u64s, 260) \ - x(btree_node_topology_empty_interior_node, 261) \ - x(btree_ptr_v2_min_key_bad, 262) \ - x(btree_root_unreadable_and_scan_found_nothing, 263) \ - x(snapshot_node_missing, 264) \ - x(dup_backpointer_to_bad_csum_extent, 265) \ - x(btree_bitmap_not_marked, 266) \ - x(sb_clean_entry_overrun, 267) \ - x(btree_ptr_v2_written_0, 268) \ - x(subvol_snapshot_bad, 269) \ - x(subvol_inode_bad, 270) +enum bch_fsck_flags { + FSCK_CAN_FIX = 1 << 0, + FSCK_CAN_IGNORE = 1 << 1, + FSCK_NEED_FSCK = 1 << 2, + FSCK_NO_RATELIMIT = 1 << 3, + FSCK_AUTOFIX = 1 << 4, +}; + +#define BCH_SB_ERRS() \ + x(clean_but_journal_not_empty, 0, 0) \ + x(dirty_but_no_journal_entries, 1, 0) \ + x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \ + x(sb_clean_journal_seq_mismatch, 3, 0) \ + x(sb_clean_btree_root_mismatch, 4, 0) \ + x(sb_clean_missing, 5, 0) \ + x(jset_unsupported_version, 6, 0) \ + x(jset_unknown_csum, 7, 0) \ + x(jset_last_seq_newer_than_seq, 8, 0) \ + x(jset_past_bucket_end, 9, 0) \ + x(jset_seq_blacklisted, 10, 0) \ + x(journal_entries_missing, 11, 0) \ + x(journal_entry_replicas_not_marked, 12, 0) \ + x(journal_entry_past_jset_end, 13, 0) \ + x(journal_entry_replicas_data_mismatch, 14, 0) \ + x(journal_entry_bkey_u64s_0, 15, 0) \ + x(journal_entry_bkey_past_end, 16, 0) \ + x(journal_entry_bkey_bad_format, 17, 0) \ + x(journal_entry_bkey_invalid, 18, 0) \ + x(journal_entry_btree_root_bad_size, 19, 0) \ + x(journal_entry_blacklist_bad_size, 20, 0) \ + x(journal_entry_blacklist_v2_bad_size, 21, 0) \ + x(journal_entry_blacklist_v2_start_past_end, 22, 0) \ + x(journal_entry_usage_bad_size, 23, 0) \ + x(journal_entry_data_usage_bad_size, 24, 0) \ + x(journal_entry_clock_bad_size, 25, 0) \ + x(journal_entry_clock_bad_rw, 26, 0) \ + x(journal_entry_dev_usage_bad_size, 27, 0) \ + x(journal_entry_dev_usage_bad_dev, 28, 0) \ + x(journal_entry_dev_usage_bad_pad, 29, 0) \ + x(btree_node_unreadable, 30, 0) \ + x(btree_node_fault_injected, 31, 0) \ + x(btree_node_bad_magic, 32, 0) \ + x(btree_node_bad_seq, 33, 0) \ + x(btree_node_unsupported_version, 34, 0) \ + x(btree_node_bset_older_than_sb_min, 35, 0) \ + x(btree_node_bset_newer_than_sb, 36, 0) \ + x(btree_node_data_missing, 37, 0) \ + x(btree_node_bset_after_end, 38, 0) \ + x(btree_node_replicas_sectors_written_mismatch, 39, 0) \ + x(btree_node_replicas_data_mismatch, 40, 0) \ + x(bset_unknown_csum, 41, 0) \ + x(bset_bad_csum, 42, 0) \ + x(bset_past_end_of_btree_node, 43, 0) \ + x(bset_wrong_sector_offset, 44, 0) \ + x(bset_empty, 45, 0) \ + x(bset_bad_seq, 46, 0) \ + x(bset_blacklisted_journal_seq, 47, 0) \ + x(first_bset_blacklisted_journal_seq, 48, 0) \ + x(btree_node_bad_btree, 49, 0) \ + x(btree_node_bad_level, 50, 0) \ + x(btree_node_bad_min_key, 51, 0) \ + x(btree_node_bad_max_key, 52, 0) \ + x(btree_node_bad_format, 53, 0) \ + x(btree_node_bkey_past_bset_end, 54, 0) \ + x(btree_node_bkey_bad_format, 55, 0) \ + x(btree_node_bad_bkey, 56, 0) \ + x(btree_node_bkey_out_of_order, 57, 0) \ + x(btree_root_bkey_invalid, 58, 0) \ + x(btree_root_read_error, 59, 0) \ + x(btree_root_bad_min_key, 60, 0) \ + x(btree_root_bad_max_key, 61, 0) \ + x(btree_node_read_error, 62, 0) \ + x(btree_node_topology_bad_min_key, 63, 0) \ + x(btree_node_topology_bad_max_key, 64, 0) \ + x(btree_node_topology_overwritten_by_prev_node, 65, 0) \ + x(btree_node_topology_overwritten_by_next_node, 66, 0) \ + x(btree_node_topology_interior_node_empty, 67, 0) \ + x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \ + x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \ + x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \ + x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \ + x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \ + x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \ + x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \ + x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \ + x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \ + x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \ + x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \ + x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \ + x(bkey_version_in_future, 80, 0) \ + x(bkey_u64s_too_small, 81, 0) \ + x(bkey_invalid_type_for_btree, 82, 0) \ + x(bkey_extent_size_zero, 83, 0) \ + x(bkey_extent_size_greater_than_offset, 84, 0) \ + x(bkey_size_nonzero, 85, 0) \ + x(bkey_snapshot_nonzero, 86, 0) \ + x(bkey_snapshot_zero, 87, 0) \ + x(bkey_at_pos_max, 88, 0) \ + x(bkey_before_start_of_btree_node, 89, 0) \ + x(bkey_after_end_of_btree_node, 90, 0) \ + x(bkey_val_size_nonzero, 91, 0) \ + x(bkey_val_size_too_small, 92, 0) \ + x(alloc_v1_val_size_bad, 93, 0) \ + x(alloc_v2_unpack_error, 94, 0) \ + x(alloc_v3_unpack_error, 95, 0) \ + x(alloc_v4_val_size_bad, 96, 0) \ + x(alloc_v4_backpointers_start_bad, 97, 0) \ + x(alloc_key_data_type_bad, 98, 0) \ + x(alloc_key_empty_but_have_data, 99, 0) \ + x(alloc_key_dirty_sectors_0, 100, 0) \ + x(alloc_key_data_type_inconsistency, 101, 0) \ + x(alloc_key_to_missing_dev_bucket, 102, 0) \ + x(alloc_key_cached_inconsistency, 103, 0) \ + x(alloc_key_cached_but_read_time_zero, 104, 0) \ + x(alloc_key_to_missing_lru_entry, 105, 0) \ + x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \ + x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \ + x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \ + x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \ + x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \ + x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ + x(bucket_sector_count_overflow, 112, 0) \ + x(bucket_metadata_type_mismatch, 113, 0) \ + x(need_discard_key_wrong, 114, 0) \ + x(freespace_key_wrong, 115, 0) \ + x(freespace_hole_missing, 116, 0) \ + x(bucket_gens_val_size_bad, 117, 0) \ + x(bucket_gens_key_wrong, 118, 0) \ + x(bucket_gens_hole_wrong, 119, 0) \ + x(bucket_gens_to_invalid_dev, 120, 0) \ + x(bucket_gens_to_invalid_buckets, 121, 0) \ + x(bucket_gens_nonzero_for_invalid_buckets, 122, 0) \ + x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ + x(need_discard_freespace_key_bad, 124, 0) \ + x(backpointer_bucket_offset_wrong, 125, 0) \ + x(backpointer_to_missing_device, 126, 0) \ + x(backpointer_to_missing_alloc, 127, 0) \ + x(backpointer_to_missing_ptr, 128, 0) \ + x(lru_entry_at_time_0, 129, 0) \ + x(lru_entry_to_invalid_bucket, 130, 0) \ + x(lru_entry_bad, 131, 0) \ + x(btree_ptr_val_too_big, 132, 0) \ + x(btree_ptr_v2_val_too_big, 133, 0) \ + x(btree_ptr_has_non_ptr, 134, 0) \ + x(extent_ptrs_invalid_entry, 135, 0) \ + x(extent_ptrs_no_ptrs, 136, 0) \ + x(extent_ptrs_too_many_ptrs, 137, 0) \ + x(extent_ptrs_redundant_crc, 138, 0) \ + x(extent_ptrs_redundant_stripe, 139, 0) \ + x(extent_ptrs_unwritten, 140, 0) \ + x(extent_ptrs_written_and_unwritten, 141, 0) \ + x(ptr_to_invalid_device, 142, 0) \ + x(ptr_to_duplicate_device, 143, 0) \ + x(ptr_after_last_bucket, 144, 0) \ + x(ptr_before_first_bucket, 145, 0) \ + x(ptr_spans_multiple_buckets, 146, 0) \ + x(ptr_to_missing_backpointer, 147, 0) \ + x(ptr_to_missing_alloc_key, 148, 0) \ + x(ptr_to_missing_replicas_entry, 149, 0) \ + x(ptr_to_missing_stripe, 150, 0) \ + x(ptr_to_incorrect_stripe, 151, 0) \ + x(ptr_gen_newer_than_bucket_gen, 152, 0) \ + x(ptr_too_stale, 153, 0) \ + x(stale_dirty_ptr, 154, 0) \ + x(ptr_bucket_data_type_mismatch, 155, 0) \ + x(ptr_cached_and_erasure_coded, 156, 0) \ + x(ptr_crc_uncompressed_size_too_small, 157, 0) \ + x(ptr_crc_csum_type_unknown, 158, 0) \ + x(ptr_crc_compression_type_unknown, 159, 0) \ + x(ptr_crc_redundant, 160, 0) \ + x(ptr_crc_uncompressed_size_too_big, 161, 0) \ + x(ptr_crc_nonce_mismatch, 162, 0) \ + x(ptr_stripe_redundant, 163, 0) \ + x(reservation_key_nr_replicas_invalid, 164, 0) \ + x(reflink_v_refcount_wrong, 165, 0) \ + x(reflink_p_to_missing_reflink_v, 166, 0) \ + x(stripe_pos_bad, 167, 0) \ + x(stripe_val_size_bad, 168, 0) \ + x(stripe_sector_count_wrong, 169, 0) \ + x(snapshot_tree_pos_bad, 170, 0) \ + x(snapshot_tree_to_missing_snapshot, 171, 0) \ + x(snapshot_tree_to_missing_subvol, 172, 0) \ + x(snapshot_tree_to_wrong_subvol, 173, 0) \ + x(snapshot_tree_to_snapshot_subvol, 174, 0) \ + x(snapshot_pos_bad, 175, 0) \ + x(snapshot_parent_bad, 176, 0) \ + x(snapshot_children_not_normalized, 177, 0) \ + x(snapshot_child_duplicate, 178, 0) \ + x(snapshot_child_bad, 179, 0) \ + x(snapshot_skiplist_not_normalized, 180, 0) \ + x(snapshot_skiplist_bad, 181, 0) \ + x(snapshot_should_not_have_subvol, 182, 0) \ + x(snapshot_to_bad_snapshot_tree, 183, 0) \ + x(snapshot_bad_depth, 184, 0) \ + x(snapshot_bad_skiplist, 185, 0) \ + x(subvol_pos_bad, 186, 0) \ + x(subvol_not_master_and_not_snapshot, 187, 0) \ + x(subvol_to_missing_root, 188, 0) \ + x(subvol_root_wrong_bi_subvol, 189, 0) \ + x(bkey_in_missing_snapshot, 190, 0) \ + x(inode_pos_inode_nonzero, 191, 0) \ + x(inode_pos_blockdev_range, 192, 0) \ + x(inode_unpack_error, 193, 0) \ + x(inode_str_hash_invalid, 194, 0) \ + x(inode_v3_fields_start_bad, 195, 0) \ + x(inode_snapshot_mismatch, 196, 0) \ + x(inode_unlinked_but_clean, 197, 0) \ + x(inode_unlinked_but_nlink_nonzero, 198, 0) \ + x(inode_checksum_type_invalid, 199, 0) \ + x(inode_compression_type_invalid, 200, 0) \ + x(inode_subvol_root_but_not_dir, 201, 0) \ + x(inode_i_size_dirty_but_clean, 202, 0) \ + x(inode_i_sectors_dirty_but_clean, 203, 0) \ + x(inode_i_sectors_wrong, 204, 0) \ + x(inode_dir_wrong_nlink, 205, 0) \ + x(inode_dir_multiple_links, 206, 0) \ + x(inode_multiple_links_but_nlink_0, 207, 0) \ + x(inode_wrong_backpointer, 208, 0) \ + x(inode_wrong_nlink, 209, 0) \ + x(inode_unreachable, 210, 0) \ + x(deleted_inode_but_clean, 211, 0) \ + x(deleted_inode_missing, 212, 0) \ + x(deleted_inode_is_dir, 213, 0) \ + x(deleted_inode_not_unlinked, 214, 0) \ + x(extent_overlapping, 215, 0) \ + x(extent_in_missing_inode, 216, 0) \ + x(extent_in_non_reg_inode, 217, 0) \ + x(extent_past_end_of_inode, 218, 0) \ + x(dirent_empty_name, 219, 0) \ + x(dirent_val_too_big, 220, 0) \ + x(dirent_name_too_long, 221, 0) \ + x(dirent_name_embedded_nul, 222, 0) \ + x(dirent_name_dot_or_dotdot, 223, 0) \ + x(dirent_name_has_slash, 224, 0) \ + x(dirent_d_type_wrong, 225, 0) \ + x(inode_bi_parent_wrong, 226, 0) \ + x(dirent_in_missing_dir_inode, 227, 0) \ + x(dirent_in_non_dir_inode, 228, 0) \ + x(dirent_to_missing_inode, 229, 0) \ + x(dirent_to_missing_subvol, 230, 0) \ + x(dirent_to_itself, 231, 0) \ + x(quota_type_invalid, 232, 0) \ + x(xattr_val_size_too_small, 233, 0) \ + x(xattr_val_size_too_big, 234, 0) \ + x(xattr_invalid_type, 235, 0) \ + x(xattr_name_invalid_chars, 236, 0) \ + x(xattr_in_missing_inode, 237, 0) \ + x(root_subvol_missing, 238, 0) \ + x(root_dir_missing, 239, 0) \ + x(root_inode_not_dir, 240, 0) \ + x(dir_loop, 241, 0) \ + x(hash_table_key_duplicate, 242, 0) \ + x(hash_table_key_wrong_offset, 243, 0) \ + x(unlinked_inode_not_on_deleted_list, 244, 0) \ + x(reflink_p_front_pad_bad, 245, 0) \ + x(journal_entry_dup_same_device, 246, 0) \ + x(inode_bi_subvol_missing, 247, 0) \ + x(inode_bi_subvol_wrong, 248, 0) \ + x(inode_points_to_missing_dirent, 249, 0) \ + x(inode_points_to_wrong_dirent, 250, 0) \ + x(inode_bi_parent_nonzero, 251, 0) \ + x(dirent_to_missing_parent_subvol, 252, 0) \ + x(dirent_not_visible_in_parent_subvol, 253, 0) \ + x(subvol_fs_path_parent_wrong, 254, 0) \ + x(subvol_root_fs_path_parent_nonzero, 255, 0) \ + x(subvol_children_not_set, 256, 0) \ + x(subvol_children_bad, 257, 0) \ + x(subvol_loop, 258, 0) \ + x(subvol_unreachable, 259, 0) \ + x(btree_node_bkey_bad_u64s, 260, 0) \ + x(btree_node_topology_empty_interior_node, 261, 0) \ + x(btree_ptr_v2_min_key_bad, 262, 0) \ + x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ + x(snapshot_node_missing, 264, 0) \ + x(dup_backpointer_to_bad_csum_extent, 265, 0) \ + x(btree_bitmap_not_marked, 266, 0) \ + x(sb_clean_entry_overrun, 267, 0) \ + x(btree_ptr_v2_written_0, 268, 0) \ + x(subvol_snapshot_bad, 269, 0) \ + x(subvol_inode_bad, 270, 0) \ + x(alloc_key_stripe_sectors_wrong, 271, 0) \ + x(accounting_mismatch, 272, 0) \ + x(accounting_replicas_not_marked, 273, 0) \ + x(invalid_btree_id, 274, 0) \ + x(alloc_key_io_time_bad, 275, 0) enum bch_sb_error_id { -#define x(t, n) BCH_FSCK_ERR_##t = n, +#define x(t, n, ...) BCH_FSCK_ERR_##t = n, BCH_SB_ERRS() #undef x BCH_SB_ERR_MAX diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h index c1860d8163fb..c4b3d8d3f414 100644 --- a/fs/bcachefs/seqmutex.h +++ b/fs/bcachefs/seqmutex.h @@ -19,17 +19,14 @@ static inline bool seqmutex_trylock(struct seqmutex *lock) static inline void seqmutex_lock(struct seqmutex *lock) { mutex_lock(&lock->lock); -} - -static inline void seqmutex_unlock(struct seqmutex *lock) -{ lock->seq++; - mutex_unlock(&lock->lock); } -static inline u32 seqmutex_seq(struct seqmutex *lock) +static inline u32 seqmutex_unlock(struct seqmutex *lock) { - return lock->seq; + u32 seq = lock->seq; + mutex_unlock(&lock->lock); + return seq; } static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 51918acfd726..24023d6a9698 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -168,6 +168,9 @@ static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1)); size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]); + if (unlikely(new_bytes > INT_MAX)) + return NULL; + new = kvzalloc(new_bytes, GFP_KERNEL); if (!new) return NULL; @@ -1565,13 +1568,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; - if (!test_bit(BCH_FS_started, &c->flags)) { - ret = bch2_fs_read_write_early(c); - bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); - if (ret) - return ret; - } - trans = bch2_trans_get(c); /* @@ -1687,6 +1683,8 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); + bch2_delete_dead_snapshots(c); bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index cbad9b27874f..c8c266cb5797 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -300,7 +300,7 @@ not_found: if (!found && (flags & STR_HASH_must_replace)) { ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; } else if (found && (flags & STR_HASH_must_create)) { - ret = -EEXIST; + ret = -BCH_ERR_EEXIST_str_hash_set; } else { if (!found && slot.path) swap(iter, slot); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index d73a0222f709..b156fc85b8a3 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -649,9 +649,10 @@ reread: bytes = vstruct_bytes(sb->sb); - if (bytes > 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits)) { - prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", - bytes, 512UL << sb->sb->layout.sb_max_size_bits); + u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits); + if (bytes > sb_size) { + prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)", + bytes, sb_size); return -BCH_ERR_invalid_sb_too_big; } @@ -1310,15 +1311,15 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, "Device index:\t%u\n", sb->dev_idx); - prt_str(out, "Label:\t"); + prt_printf(out, "Label:\t"); prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); prt_newline(out); - prt_str(out, "Version:\t"); + prt_printf(out, "Version:\t"); bch2_version_to_text(out, le16_to_cpu(sb->version)); prt_newline(out); - prt_str(out, "Version upgrade complete:\t"); + prt_printf(out, "Version upgrade complete:\t"); bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); prt_newline(out); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index df2bea38e83f..fb906467201e 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -536,7 +536,6 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); - bch2_fs_allocator_background_exit(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); @@ -582,8 +581,10 @@ static void __bch2_fs_free(struct bch_fs *c) if (c->write_ref_wq) destroy_workqueue(c->write_ref_wq); - if (c->io_complete_wq) - destroy_workqueue(c->io_complete_wq); + if (c->btree_write_submit_wq) + destroy_workqueue(c->btree_write_submit_wq); + if (c->btree_read_complete_wq) + destroy_workqueue(c->btree_read_complete_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->btree_io_complete_wq) @@ -878,8 +879,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->io_complete_wq = alloc_workqueue("bcachefs_io", + !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || + !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG @@ -908,9 +911,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: bch2_fs_replicas_init(c) ?: + bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_interior_update_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: @@ -927,12 +930,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - for (i = 0; i < c->sb.nr_devices; i++) - if (bch2_member_exists(c->disk_sb.sb, i) && - bch2_dev_alloc(c, i)) { - ret = -EEXIST; + for (i = 0; i < c->sb.nr_devices; i++) { + if (!bch2_member_exists(c->disk_sb.sb, i)) + continue; + ret = bch2_dev_alloc(c, i); + if (ret) goto err; - } + } bch2_journal_entry_res_resize(&c->journal, &c->btree_root_journal_res, @@ -1190,6 +1194,7 @@ static void bch2_dev_free(struct bch_dev *ca) kfree(ca->buckets_nouse); bch2_free_super(&ca->disk_sb); + bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); free_percpu(ca->io_done); @@ -1312,6 +1317,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, atomic_long_set(&ca->ref, 1); #endif + bch2_dev_allocator_background_init(ca); + if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || @@ -1524,6 +1531,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) * The allocator thread itself allocates btree nodes, so stop it first: */ bch2_dev_allocator_remove(c, ca); + bch2_recalc_capacity(c); bch2_dev_journal_stop(&c->journal, ca); } @@ -1535,6 +1543,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + bch2_dev_do_discards(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 477f350a8bd0..e3a57196b0ee 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -741,7 +741,9 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) ret = btrfs_bio_csum(bbio); if (ret) goto fail_put_bio; - } else if (use_append) { + } else if (use_append || + (btrfs_is_zoned(fs_info) && inode && + inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); if (ret) goto fail_put_bio; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 1e09aeea69c2..60066822b532 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1785,6 +1785,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; + LIST_HEAD(retry_list); if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; @@ -1921,8 +1922,20 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } next: - if (ret) - btrfs_mark_bg_to_reclaim(bg); + if (ret) { + /* Refcount held by the reclaim_bgs list after splice. */ + spin_lock(&fs_info->unused_bgs_lock); + /* + * This block group might be added to the unused list + * during the above process. Move it back to the + * reclaim list otherwise. + */ + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, &retry_list); + } + spin_unlock(&fs_info->unused_bgs_lock); + } btrfs_put_block_group(bg); mutex_unlock(&fs_info->reclaim_bgs_lock); @@ -1942,6 +1955,9 @@ next: spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); end: + spin_lock(&fs_info->unused_bgs_lock); + list_splice_tail(&retry_list, &fs_info->reclaim_bgs); + spin_unlock(&fs_info->unused_bgs_lock); btrfs_exclop_finish(fs_info); sb_end_write(fs_info->sb); } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 91c994b569f3..6ed495ca7a31 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -89,6 +89,16 @@ enum { BTRFS_INODE_FREE_SPACE_INODE, /* Set when there are no capabilities in XATTs for the inode. */ BTRFS_INODE_NO_CAP_XATTR, + /* + * Set if an error happened when doing a COW write before submitting a + * bio or during writeback. Used for both buffered writes and direct IO + * writes. This is to signal a fast fsync that it has to wait for + * ordered extents to complete and therefore not log extent maps that + * point to unwritten extents (when an ordered extent completes and it + * has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its + * range). + */ + BTRFS_INODE_COW_WRITE_ERROR, }; /* in memory btrfs inode */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1b20b3e390df..38cdb8875e8e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4538,18 +4538,10 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) { struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; struct btrfs_delayed_ref_node *ref; - delayed_refs = &trans->delayed_refs; - spin_lock(&delayed_refs->lock); - if (atomic_read(&delayed_refs->num_entries) == 0) { - spin_unlock(&delayed_refs->lock); - btrfs_debug(fs_info, "delayed_refs has NO entry"); - return; - } - while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; struct rb_node *n; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 597387e9f040..958155cc43a8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3553,7 +3553,7 @@ err: for (int i = 0; i < num_folios; i++) { if (eb->folios[i]) { detach_extent_buffer_folio(eb, eb->folios[i]); - __folio_put(eb->folios[i]); + folio_put(eb->folios[i]); } } __free_extent_buffer(eb); @@ -3689,6 +3689,8 @@ static struct extent_buffer *grab_extent_buffer( struct folio *folio = page_folio(page); struct extent_buffer *exists; + lockdep_assert_held(&page->mapping->i_private_lock); + /* * For subpage case, we completely rely on radix tree to ensure we * don't try to insert two ebs for the same bytenr. So here we always @@ -3756,13 +3758,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) * The caller needs to free the existing folios and retry using the same order. */ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, + struct btrfs_subpage *prealloc, struct extent_buffer **found_eb_ret) { struct btrfs_fs_info *fs_info = eb->fs_info; struct address_space *mapping = fs_info->btree_inode->i_mapping; const unsigned long index = eb->start >> PAGE_SHIFT; - struct folio *existing_folio; + struct folio *existing_folio = NULL; int ret; ASSERT(found_eb_ret); @@ -3774,12 +3777,14 @@ retry: ret = filemap_add_folio(mapping, eb->folios[i], index + i, GFP_NOFS | __GFP_NOFAIL); if (!ret) - return 0; + goto finish; existing_folio = filemap_lock_folio(mapping, index + i); /* The page cache only exists for a very short time, just retry. */ - if (IS_ERR(existing_folio)) + if (IS_ERR(existing_folio)) { + existing_folio = NULL; goto retry; + } /* For now, we should only have single-page folios for btree inode. */ ASSERT(folio_nr_pages(existing_folio) == 1); @@ -3790,14 +3795,13 @@ retry: return -EAGAIN; } - if (fs_info->nodesize < PAGE_SIZE) { - /* - * We're going to reuse the existing page, can drop our page - * and subpage structure now. - */ +finish: + spin_lock(&mapping->i_private_lock); + if (existing_folio && fs_info->nodesize < PAGE_SIZE) { + /* We're going to reuse the existing page, can drop our folio now. */ __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; - } else { + } else if (existing_folio) { struct extent_buffer *existing_eb; existing_eb = grab_extent_buffer(fs_info, @@ -3805,6 +3809,7 @@ retry: if (existing_eb) { /* The extent buffer still exists, we can use it directly. */ *found_eb_ret = existing_eb; + spin_unlock(&mapping->i_private_lock); folio_unlock(existing_folio); folio_put(existing_folio); return 1; @@ -3813,6 +3818,22 @@ retry: __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; } + eb->folio_size = folio_size(eb->folios[i]); + eb->folio_shift = folio_shift(eb->folios[i]); + /* Should not fail, as we have preallocated the memory. */ + ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc); + ASSERT(!ret); + /* + * To inform we have an extra eb under allocation, so that + * detach_extent_buffer_page() won't release the folio private when the + * eb hasn't been inserted into radix tree yet. + * + * The ref will be decreased when the eb releases the page, in + * detach_extent_buffer_page(). Thus needs no special handling in the + * error path. + */ + btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]); + spin_unlock(&mapping->i_private_lock); return 0; } @@ -3824,7 +3845,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; - struct address_space *mapping = fs_info->btree_inode->i_mapping; struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; bool page_contig = true; @@ -3890,7 +3910,7 @@ reallocate: for (int i = 0; i < num_folios; i++) { struct folio *folio; - ret = attach_eb_folio_to_filemap(eb, i, &existing_eb); + ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); if (ret > 0) { ASSERT(existing_eb); goto out; @@ -3927,24 +3947,6 @@ reallocate: * and free the allocated page. */ folio = eb->folios[i]; - eb->folio_size = folio_size(folio); - eb->folio_shift = folio_shift(folio); - spin_lock(&mapping->i_private_lock); - /* Should not fail, as we have preallocated the memory */ - ret = attach_extent_buffer_folio(eb, folio, prealloc); - ASSERT(!ret); - /* - * To inform we have extra eb under allocation, so that - * detach_extent_buffer_page() won't release the folio private - * when the eb hasn't yet been inserted into radix tree. - * - * The ref will be decreased when the eb released the page, in - * detach_extent_buffer_page(). - * Thus needs no special handling in error path. - */ - btrfs_folio_inc_eb_refs(fs_info, folio); - spin_unlock(&mapping->i_private_lock); - WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); /* diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e764ac3f22e2..d90138683a0a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1885,6 +1885,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (full_sync || btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, start, len); + clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &BTRFS_I(inode)->runtime_flags); } else { /* * Get our ordered extents as soon as possible to avoid doing @@ -1894,6 +1895,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_get_ordered_extents_for_logging(BTRFS_I(inode), &ctx.ordered_extents); ret = filemap_fdatawait_range(inode->i_mapping, start, end); + if (ret) + goto out_release_extents; + + /* + * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after + * starting and waiting for writeback, because for buffered IO + * it may have been set during the end IO callback + * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in + * case an error happened and we need to wait for ordered + * extents to complete so that any extent maps that point to + * unwritten locations are dropped and we don't log them. + */ + if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, + &BTRFS_I(inode)->runtime_flags)) + ret = btrfs_wait_ordered_range(inode, start, len); } if (ret) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 3ab8dea5036b..dabc3d0793cf 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2697,7 +2697,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; int bg_reclaim_threshold = 0; - bool initial = (size == block_group->length); + bool initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); u64 reclaimable_unusable; WARN_ON(!initial && offset + size > block_group->zone_capacity); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 753db965f7c0..3a2b902b2d1f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10385,7 +10385,7 @@ out_unlock: out_folios: for (i = 0; i < nr_folios; i++) { if (folios[i]) - __folio_put(folios[i]); + folio_put(folios[i]); } kvfree(folios); out: diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index c5bdd674f55c..35a413ce935d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -388,6 +388,37 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + /* + * If this is a COW write it means we created new extent maps for the + * range and they point to unwritten locations if we got an error either + * before submitting a bio or during IO. + * + * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we + * are queuing its completion below. During completion, at + * btrfs_finish_one_ordered(), we will drop the extent maps for the + * unwritten extents. + * + * However because completion runs in a work queue we can end up having + * a fast fsync running before that. In the case of direct IO, once we + * unlock the inode the fsync might start, and we queue the completion + * before unlocking the inode. In the case of buffered IO when writeback + * finishes (end_bbio_data_write()) we queue the completion, so if the + * writeback was triggered by a fast fsync, the fsync might start + * logging before ordered extent completion runs in the work queue. + * + * The fast fsync will log file extent items based on the extent maps it + * finds, so if by the time it collects extent maps the ordered extent + * completion didn't happen yet, it will log file extent items that + * point to unwritten extents, resulting in a corruption if a crash + * happens and the log tree is replayed. Note that a fast fsync does not + * wait for completion of ordered extents in order to reduce latency. + * + * Set a flag in the inode so that the next fast fsync will wait for + * ordered extents to complete before starting to log. + */ + if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags); + if (ret) btrfs_queue_ordered_fn(ordered); return ret; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fc2a7ea26354..39a15cca58ca 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1351,7 +1351,7 @@ static int flush_reservations(struct btrfs_fs_info *fs_info) int btrfs_quota_disable(struct btrfs_fs_info *fs_info) { - struct btrfs_root *quota_root; + struct btrfs_root *quota_root = NULL; struct btrfs_trans_handle *trans = NULL; int ret = 0; @@ -1449,9 +1449,9 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_free_tree_block(trans, btrfs_root_id(quota_root), quota_root->node, 0, 1); - btrfs_put_root(quota_root); out: + btrfs_put_root(quota_root); mutex_unlock(&fs_info->qgroup_ioctl_lock); if (ret && trans) btrfs_end_transaction(trans); @@ -3062,8 +3062,6 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_inherit *inherit, size_t size) { - if (!btrfs_qgroup_enabled(fs_info)) - return 0; if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP) return -EOPNOTSUPP; if (size < sizeof(*inherit) || size > PAGE_SIZE) @@ -3085,6 +3083,14 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, return -EINVAL; /* + * Skip the inherit source qgroups check if qgroup is not enabled. + * Qgroup can still be later enabled causing problems, but in that case + * btrfs_qgroup_inherit() would just ignore those invalid ones. + */ + if (!btrfs_qgroup_enabled(fs_info)) + return 0; + + /* * Now check all the remaining qgroups, they should all: * * - Exist diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index cf531255ab76..9522a8b79d22 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -441,7 +441,8 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, u32 item_size = btrfs_item_size(leaf, slot); unsigned long end, ptr; u64 offset, flags, count; - int type, ret; + int type; + int ret = 0; ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); @@ -486,7 +487,11 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, key->objectid, key->offset); break; case BTRFS_EXTENT_OWNER_REF_KEY: - WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) { + btrfs_err(fs_info, + "found extent owner ref without simple quotas enabled"); + ret = -EINVAL; + } break; default: btrfs_err(fs_info, "invalid key type in iref"); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index afd6932f5e89..d7caa3732f07 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1688,20 +1688,24 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, (i << fs_info->sectorsize_bits); int err; - bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, - fs_info, scrub_read_endio, stripe); - bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; - io_stripe.is_scrub = true; + stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; + /* + * For RST cases, we need to manually split the bbio to + * follow the RST boundary. + */ err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &stripe_len, &bioc, &io_stripe, - &mirror); + &stripe_len, &bioc, &io_stripe, &mirror); btrfs_put_bioc(bioc); - if (err) { - btrfs_bio_end_io(bbio, - errno_to_blk_status(err)); - return; + if (err < 0) { + set_bit(i, &stripe->io_error_bitmap); + set_bit(i, &stripe->error_bitmap); + continue; } + + bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, + fs_info, scrub_read_endio, stripe); + bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; } __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d620323d08ea..ae8c56442549 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -373,11 +373,18 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, * "optimal" chunk size based on the fs size. However when we actually * allocate the chunk we will strip this down further, making it no more * than 10% of the disk or 1G, whichever is smaller. + * + * On the zoned mode, we need to use zone_size (= + * data_sinfo->chunk_size) as it is. */ data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); - data_chunk_size = min(data_sinfo->chunk_size, - mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); - data_chunk_size = min_t(u64, data_chunk_size, SZ_1G); + if (!btrfs_is_zoned(fs_info)) { + data_chunk_size = min(data_sinfo->chunk_size, + mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); + data_chunk_size = min_t(u64, data_chunk_size, SZ_1G); + } else { + data_chunk_size = data_sinfo->chunk_size; + } /* * Since data allocations immediately use block groups as part of the @@ -405,6 +412,17 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, avail >>= 3; else avail >>= 1; + + /* + * On the zoned mode, we always allocate one zone as one chunk. + * Returning non-zone size alingned bytes here will result in + * less pressure for the async metadata reclaim process, and it + * will over-commit too much leading to ENOSPC. Align down to the + * zone size to avoid that. + */ + if (btrfs_is_zoned(fs_info)) + avail = ALIGN_DOWN(avail, fs_info->zone_size); + return avail; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5146387b416b..0bce1d45e252 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -138,6 +138,25 @@ static void wait_log_commit(struct btrfs_root *root, int transid); * and once to do all the other items. */ +static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) +{ + unsigned int nofs_flag; + struct inode *inode; + + /* + * We're holding a transaction handle whether we are logging or + * replaying a log tree, so we must make sure NOFS semantics apply + * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL + * to allocate an inode, which can recurse back into the filesystem and + * attempt a transaction commit, resulting in a deadlock. + */ + nofs_flag = memalloc_nofs_save(); + inode = btrfs_iget(root->fs_info->sb, objectid, root); + memalloc_nofs_restore(nofs_flag); + + return inode; +} + /* * start a sub transaction and setup the log tree * this increments the log tree writer count to make the people @@ -600,7 +619,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root, { struct inode *inode; - inode = btrfs_iget(root->fs_info->sb, objectid, root); + inode = btrfs_iget_logging(objectid, root); if (IS_ERR(inode)) inode = NULL; return inode; @@ -4860,18 +4879,23 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, path->slots[0]++; continue; } - if (!dropped_extents) { - /* - * Avoid logging extent items logged in past fsync calls - * and leading to duplicate keys in the log tree. - */ + /* + * Avoid overlapping items in the log tree. The first time we + * get here, get rid of everything from a past fsync. After + * that, if the current extent starts before the end of the last + * extent we copied, truncate the last one. This can happen if + * an ordered extent completion modifies the subvolume tree + * while btrfs_next_leaf() has the tree unlocked. + */ + if (!dropped_extents || key.offset < truncate_offset) { ret = truncate_inode_items(trans, root->log_root, inode, - truncate_offset, + min(key.offset, truncate_offset), BTRFS_EXTENT_DATA_KEY); if (ret) goto out; dropped_extents = true; } + truncate_offset = btrfs_file_extent_end(path); if (ins_nr == 0) start_slot = slot; ins_nr++; @@ -5433,7 +5457,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_root *root = start_inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; LIST_HEAD(dir_list); struct btrfs_dir_list *dir_elem; @@ -5494,7 +5517,7 @@ again: continue; btrfs_release_path(path); - di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); + di_inode = btrfs_iget_logging(di_key.objectid, root); if (IS_ERR(di_inode)) { ret = PTR_ERR(di_inode); goto out; @@ -5554,7 +5577,7 @@ again: btrfs_add_delayed_iput(curr_inode); curr_inode = NULL; - vfs_inode = btrfs_iget(fs_info->sb, ino, root); + vfs_inode = btrfs_iget_logging(ino, root); if (IS_ERR(vfs_inode)) { ret = PTR_ERR(vfs_inode); break; @@ -5649,7 +5672,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) return BTRFS_LOG_FORCE_COMMIT; - inode = btrfs_iget(root->fs_info->sb, ino, root); + inode = btrfs_iget_logging(ino, root); /* * If the other inode that had a conflicting dir entry was deleted in * the current transaction then we either: @@ -5750,7 +5773,6 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; /* @@ -5781,7 +5803,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, list_del(&curr->list); kfree(curr); - inode = btrfs_iget(fs_info->sb, ino, root); + inode = btrfs_iget_logging(ino, root); /* * If the other inode that had a conflicting dir entry was * deleted in the current transaction, we need to log its parent @@ -5792,7 +5814,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, if (ret != -ENOENT) break; - inode = btrfs_iget(fs_info->sb, parent, root); + inode = btrfs_iget_logging(parent, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); break; @@ -6314,7 +6336,6 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { const bool orig_log_new_dentries = ctx->log_new_dentries; - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_item *item; int ret = 0; @@ -6340,7 +6361,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, if (key.type == BTRFS_ROOT_ITEM_KEY) continue; - di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root); + di_inode = btrfs_iget_logging(key.objectid, inode->root); if (IS_ERR(di_inode)) { ret = PTR_ERR(di_inode); break; @@ -6724,7 +6745,6 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_path *path; struct btrfs_key key; @@ -6789,8 +6809,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, cur_offset = item_size; } - dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, - root); + dir_inode = btrfs_iget_logging(inode_key.objectid, root); /* * If the parent inode was deleted, return an error to * fallback to a transaction commit. This is to prevent @@ -6852,7 +6871,6 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); while (true) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; int slot; struct btrfs_key search_key; @@ -6867,7 +6885,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, search_key.objectid = found_key.offset; search_key.type = BTRFS_INODE_ITEM_KEY; search_key.offset = 0; - inode = btrfs_iget(fs_info->sb, ino, root); + inode = btrfs_iget_logging(ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 6465e2574230..06cdf1a8a16f 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -133,7 +133,7 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file) return 0; } -static void cachefiles_flush_reqs(struct cachefiles_cache *cache) +void cachefiles_flush_reqs(struct cachefiles_cache *cache) { struct xarray *xa = &cache->reqs; struct cachefiles_req *req; @@ -159,6 +159,7 @@ static void cachefiles_flush_reqs(struct cachefiles_cache *cache) xa_for_each(xa, index, req) { req->error = -EIO; complete(&req->done); + __xa_erase(xa, index); } xa_unlock(xa); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index d33169f0018b..6845a90cdfcc 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -55,6 +55,7 @@ struct cachefiles_ondemand_info { int ondemand_id; enum cachefiles_object_state state; struct cachefiles_object *object; + spinlock_t lock; }; /* @@ -138,6 +139,7 @@ static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache) struct cachefiles_req { struct cachefiles_object *object; struct completion done; + refcount_t ref; int error; struct cachefiles_msg msg; }; @@ -186,6 +188,7 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache, * daemon.c */ extern const struct file_operations cachefiles_daemon_fops; +extern void cachefiles_flush_reqs(struct cachefiles_cache *cache); extern void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache); extern void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache); @@ -424,6 +427,8 @@ do { \ pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \ fscache_io_error((___cache)->cache); \ set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ + if (cachefiles_in_ondemand_mode(___cache)) \ + cachefiles_flush_reqs(___cache); \ } while (0) #define cachefiles_io_error_obj(object, FMT, ...) \ diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 4ba42f1fa3b4..bce005f2b456 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -1,22 +1,42 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include <linux/fdtable.h> #include <linux/anon_inodes.h> #include <linux/uio.h> #include "internal.h" +struct ondemand_anon_file { + struct file *file; + int fd; +}; + +static inline void cachefiles_req_put(struct cachefiles_req *req) +{ + if (refcount_dec_and_test(&req->ref)) + kfree(req); +} + static int cachefiles_ondemand_fd_release(struct inode *inode, struct file *file) { struct cachefiles_object *object = file->private_data; - struct cachefiles_cache *cache = object->volume->cache; - struct cachefiles_ondemand_info *info = object->ondemand; - int object_id = info->ondemand_id; + struct cachefiles_cache *cache; + struct cachefiles_ondemand_info *info; + int object_id; struct cachefiles_req *req; - XA_STATE(xas, &cache->reqs, 0); + XA_STATE(xas, NULL, 0); + + if (!object) + return 0; + + info = object->ondemand; + cache = object->volume->cache; + xas.xa = &cache->reqs; xa_lock(&cache->reqs); + spin_lock(&info->lock); + object_id = info->ondemand_id; info->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED; cachefiles_ondemand_set_object_close(object); + spin_unlock(&info->lock); /* Only flush CACHEFILES_REQ_NEW marked req to avoid race with daemon_read */ xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) { @@ -76,12 +96,12 @@ static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos, } static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) + unsigned long id) { struct cachefiles_object *object = filp->private_data; struct cachefiles_cache *cache = object->volume->cache; struct cachefiles_req *req; - unsigned long id; + XA_STATE(xas, &cache->reqs, id); if (ioctl != CACHEFILES_IOC_READ_COMPLETE) return -EINVAL; @@ -89,10 +109,15 @@ static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl, if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) return -EOPNOTSUPP; - id = arg; - req = xa_erase(&cache->reqs, id); - if (!req) + xa_lock(&cache->reqs); + req = xas_load(&xas); + if (!req || req->msg.opcode != CACHEFILES_OP_READ || + req->object != object) { + xa_unlock(&cache->reqs); return -EINVAL; + } + xas_store(&xas, NULL); + xa_unlock(&cache->reqs); trace_cachefiles_ondemand_cread(object, id); complete(&req->done); @@ -116,10 +141,12 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) { struct cachefiles_req *req; struct fscache_cookie *cookie; + struct cachefiles_ondemand_info *info; char *pid, *psize; unsigned long id; long size; int ret; + XA_STATE(xas, &cache->reqs, 0); if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) return -EOPNOTSUPP; @@ -143,10 +170,18 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) if (ret) return ret; - req = xa_erase(&cache->reqs, id); - if (!req) + xa_lock(&cache->reqs); + xas.xa_index = id; + req = xas_load(&xas); + if (!req || req->msg.opcode != CACHEFILES_OP_OPEN || + !req->object->ondemand->ondemand_id) { + xa_unlock(&cache->reqs); return -EINVAL; + } + xas_store(&xas, NULL); + xa_unlock(&cache->reqs); + info = req->object->ondemand; /* fail OPEN request if copen format is invalid */ ret = kstrtol(psize, 0, &size); if (ret) { @@ -166,6 +201,32 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) goto out; } + spin_lock(&info->lock); + /* + * The anonymous fd was closed before copen ? Fail the request. + * + * t1 | t2 + * --------------------------------------------------------- + * cachefiles_ondemand_copen + * req = xa_erase(&cache->reqs, id) + * // Anon fd is maliciously closed. + * cachefiles_ondemand_fd_release + * xa_lock(&cache->reqs) + * cachefiles_ondemand_set_object_close(object) + * xa_unlock(&cache->reqs) + * cachefiles_ondemand_set_object_open + * // No one will ever close it again. + * cachefiles_ondemand_daemon_read + * cachefiles_ondemand_select_req + * + * Get a read req but its fd is already closed. The daemon can't + * issue a cread ioctl with an closed fd, then hung. + */ + if (info->ondemand_id == CACHEFILES_ONDEMAND_ID_CLOSED) { + spin_unlock(&info->lock); + req->error = -EBADFD; + goto out; + } cookie = req->object->cookie; cookie->object_size = size; if (size) @@ -175,9 +236,15 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) trace_cachefiles_ondemand_copen(req->object, id, size); cachefiles_ondemand_set_object_open(req->object); + spin_unlock(&info->lock); wake_up_all(&cache->daemon_pollwq); out: + spin_lock(&info->lock); + /* Need to set object close to avoid reopen status continuing */ + if (info->ondemand_id == CACHEFILES_ONDEMAND_ID_CLOSED) + cachefiles_ondemand_set_object_close(req->object); + spin_unlock(&info->lock); complete(&req->done); return ret; } @@ -205,14 +272,14 @@ int cachefiles_ondemand_restore(struct cachefiles_cache *cache, char *args) return 0; } -static int cachefiles_ondemand_get_fd(struct cachefiles_req *req) +static int cachefiles_ondemand_get_fd(struct cachefiles_req *req, + struct ondemand_anon_file *anon_file) { struct cachefiles_object *object; struct cachefiles_cache *cache; struct cachefiles_open *load; - struct file *file; u32 object_id; - int ret, fd; + int ret; object = cachefiles_grab_object(req->object, cachefiles_obj_get_ondemand_fd); @@ -224,35 +291,53 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req) if (ret < 0) goto err; - fd = get_unused_fd_flags(O_WRONLY); - if (fd < 0) { - ret = fd; + anon_file->fd = get_unused_fd_flags(O_WRONLY); + if (anon_file->fd < 0) { + ret = anon_file->fd; goto err_free_id; } - file = anon_inode_getfile("[cachefiles]", &cachefiles_ondemand_fd_fops, - object, O_WRONLY); - if (IS_ERR(file)) { - ret = PTR_ERR(file); + anon_file->file = anon_inode_getfile("[cachefiles]", + &cachefiles_ondemand_fd_fops, object, O_WRONLY); + if (IS_ERR(anon_file->file)) { + ret = PTR_ERR(anon_file->file); goto err_put_fd; } - file->f_mode |= FMODE_PWRITE | FMODE_LSEEK; - fd_install(fd, file); + spin_lock(&object->ondemand->lock); + if (object->ondemand->ondemand_id > 0) { + spin_unlock(&object->ondemand->lock); + /* Pair with check in cachefiles_ondemand_fd_release(). */ + anon_file->file->private_data = NULL; + ret = -EEXIST; + goto err_put_file; + } + + anon_file->file->f_mode |= FMODE_PWRITE | FMODE_LSEEK; load = (void *)req->msg.data; - load->fd = fd; + load->fd = anon_file->fd; object->ondemand->ondemand_id = object_id; + spin_unlock(&object->ondemand->lock); cachefiles_get_unbind_pincount(cache); trace_cachefiles_ondemand_open(object, &req->msg, load); return 0; +err_put_file: + fput(anon_file->file); + anon_file->file = NULL; err_put_fd: - put_unused_fd(fd); + put_unused_fd(anon_file->fd); + anon_file->fd = ret; err_free_id: xa_erase(&cache->ondemand_ids, object_id); err: + spin_lock(&object->ondemand->lock); + /* Avoid marking an opened object as closed. */ + if (object->ondemand->ondemand_id <= 0) + cachefiles_ondemand_set_object_close(object); + spin_unlock(&object->ondemand->lock); cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd); return ret; } @@ -294,14 +379,28 @@ static struct cachefiles_req *cachefiles_ondemand_select_req(struct xa_state *xa return NULL; } +static inline bool cachefiles_ondemand_finish_req(struct cachefiles_req *req, + struct xa_state *xas, int err) +{ + if (unlikely(!xas || !req)) + return false; + + if (xa_cmpxchg(xas->xa, xas->xa_index, req, NULL, 0) != req) + return false; + + req->error = err; + complete(&req->done); + return true; +} + ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, char __user *_buffer, size_t buflen) { struct cachefiles_req *req; struct cachefiles_msg *msg; - unsigned long id = 0; size_t n; int ret = 0; + struct ondemand_anon_file anon_file; XA_STATE(xas, &cache->reqs, cache->req_id_next); xa_lock(&cache->reqs); @@ -330,42 +429,37 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, xas_clear_mark(&xas, CACHEFILES_REQ_NEW); cache->req_id_next = xas.xa_index + 1; + refcount_inc(&req->ref); + cachefiles_grab_object(req->object, cachefiles_obj_get_read_req); xa_unlock(&cache->reqs); - id = xas.xa_index; - if (msg->opcode == CACHEFILES_OP_OPEN) { - ret = cachefiles_ondemand_get_fd(req); - if (ret) { - cachefiles_ondemand_set_object_close(req->object); - goto error; - } + ret = cachefiles_ondemand_get_fd(req, &anon_file); + if (ret) + goto out; } - msg->msg_id = id; + msg->msg_id = xas.xa_index; msg->object_id = req->object->ondemand->ondemand_id; - if (copy_to_user(_buffer, msg, n) != 0) { + if (copy_to_user(_buffer, msg, n) != 0) ret = -EFAULT; - goto err_put_fd; - } - /* CLOSE request has no reply */ - if (msg->opcode == CACHEFILES_OP_CLOSE) { - xa_erase(&cache->reqs, id); - complete(&req->done); + if (msg->opcode == CACHEFILES_OP_OPEN) { + if (ret < 0) { + fput(anon_file.file); + put_unused_fd(anon_file.fd); + goto out; + } + fd_install(anon_file.fd, anon_file.file); } - - return n; - -err_put_fd: - if (msg->opcode == CACHEFILES_OP_OPEN) - close_fd(((struct cachefiles_open *)msg->data)->fd); -error: - xa_erase(&cache->reqs, id); - req->error = ret; - complete(&req->done); - return ret; +out: + cachefiles_put_object(req->object, cachefiles_obj_put_read_req); + /* Remove error request and CLOSE request has no reply */ + if (ret || msg->opcode == CACHEFILES_OP_CLOSE) + cachefiles_ondemand_finish_req(req, &xas, ret); + cachefiles_req_put(req); + return ret ? ret : n; } typedef int (*init_req_fn)(struct cachefiles_req *req, void *private); @@ -395,6 +489,7 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, goto out; } + refcount_set(&req->ref, 1); req->object = object; init_completion(&req->done); req->msg.opcode = opcode; @@ -454,9 +549,19 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, goto out; wake_up_all(&cache->daemon_pollwq); - wait_for_completion(&req->done); - ret = req->error; - kfree(req); +wait: + ret = wait_for_completion_killable(&req->done); + if (!ret) { + ret = req->error; + } else { + ret = -EINTR; + if (!cachefiles_ondemand_finish_req(req, &xas, ret)) { + /* Someone will complete it soon. */ + cpu_relax(); + goto wait; + } + } + cachefiles_req_put(req); return ret; out: /* Reset the object to close state in error handling path. @@ -578,6 +683,7 @@ int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object, return -ENOMEM; object->ondemand->object = object; + spin_lock_init(&object->ondemand->lock); INIT_WORK(&object->ondemand->ondemand_work, ondemand_object_worker); return 0; } diff --git a/fs/dcache.c b/fs/dcache.c index 407095188f83..d58dc9e58f3b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3029,28 +3029,25 @@ EXPORT_SYMBOL(d_splice_alias); bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { - bool result; + bool subdir; unsigned seq; if (new_dentry == old_dentry) return true; - do { - /* for restarting inner loop in case of seq retry */ - seq = read_seqbegin(&rename_lock); - /* - * Need rcu_readlock to protect against the d_parent trashing - * due to d_move - */ - rcu_read_lock(); - if (d_ancestor(old_dentry, new_dentry)) - result = true; - else - result = false; - rcu_read_unlock(); - } while (read_seqretry(&rename_lock, seq)); - - return result; + /* Access d_parent under rcu as d_move() may change it. */ + rcu_read_lock(); + seq = read_seqbegin(&rename_lock); + subdir = d_ancestor(old_dentry, new_dentry); + /* Try lockless once... */ + if (read_seqretry(&rename_lock, seq)) { + /* ...else acquire lock for progress even on deep chains. */ + read_seqlock_excl(&rename_lock); + subdir = d_ancestor(old_dentry, new_dentry); + read_sequnlock_excl(&rename_lock); + } + rcu_read_unlock(); + return subdir; } EXPORT_SYMBOL(is_subdir); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index dc51df0b118d..8fd928899a59 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -107,8 +107,16 @@ static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param int opt; opt = fs_parse(fc, debugfs_param_specs, param, &result); - if (opt < 0) + if (opt < 0) { + /* + * We might like to report bad mount options here; but + * traditionally debugfs has ignored all mount options + */ + if (opt == -ENOPARAM) + return 0; + return opt; + } switch (opt) { case Opt_uid: diff --git a/fs/erofs/super.c b/fs/erofs/super.c index c93bd24d2771..1b91d9513013 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -343,7 +343,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->build_time = le64_to_cpu(dsb->build_time); sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); - memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid)); + super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); ret = strscpy(sbi->volume_name, dsb->volume_name, sizeof(dsb->volume_name)); diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 9b248ee5fef2..74d3d7bffcf3 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -711,6 +711,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, err = z_erofs_do_map_blocks(inode, map, flags); out: + if (err) + map->m_llen = 0; trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); return err; } diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 036024bce9f7..b80f612867c2 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -148,7 +148,7 @@ int __init z_erofs_gbuf_init(void) void z_erofs_gbuf_exit(void) { - int i; + int i, j; for (i = 0; i < z_erofs_gbuf_count + (!!z_erofs_rsvbuf); ++i) { struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i]; @@ -161,9 +161,9 @@ void z_erofs_gbuf_exit(void) if (!gbuf->pages) continue; - for (i = 0; i < gbuf->nrpages; ++i) - if (gbuf->pages[i]) - put_page(gbuf->pages[i]); + for (j = 0; j < gbuf->nrpages; ++j) + if (gbuf->pages[j]) + put_page(gbuf->pages[j]); kfree(gbuf->pages); gbuf->pages = NULL; } diff --git a/fs/file.c b/fs/file.c index 8076aef9c210..a3b72aa64f11 100644 --- a/fs/file.c +++ b/fs/file.c @@ -486,12 +486,12 @@ struct files_struct init_files = { static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { - unsigned int maxfd = fdt->max_fds; + unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; - if (bitbit > maxfd) + if (bitbit >= maxfd) return maxfd; if (bitbit > start) start = bitbit; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index c5802a459334..d46558990279 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -241,6 +241,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, unsigned block_size = (1 << block_bits); size_t poff = offset_in_folio(folio, *pos); size_t plen = min_t(loff_t, folio_size(folio) - poff, length); + size_t orig_plen = plen; unsigned first = poff >> block_bits; unsigned last = (poff + plen - 1) >> block_bits; @@ -277,7 +278,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, * handle both halves separately so that we properly zero data in the * page cache for blocks that are entirely outside of i_size. */ - if (orig_pos <= isize && orig_pos + length > isize) { + if (orig_pos <= isize && orig_pos + orig_plen > isize) { unsigned end = offset_in_folio(folio, isize - 1) >> block_bits; if (first <= end && last > end) @@ -877,22 +878,37 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t copied, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t old_size = iter->inode->i_size; + size_t written; if (srcmap->type == IOMAP_INLINE) { iomap_write_end_inline(iter, folio, pos, copied); - return true; + written = copied; + } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { + written = block_write_end(NULL, iter->inode->i_mapping, pos, + len, copied, &folio->page, NULL); + WARN_ON_ONCE(written != copied && written != 0); + } else { + written = __iomap_write_end(iter->inode, pos, len, copied, + folio) ? copied : 0; } - if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { - size_t bh_written; - - bh_written = block_write_end(NULL, iter->inode->i_mapping, pos, - len, copied, &folio->page, NULL); - WARN_ON_ONCE(bh_written != copied && bh_written != 0); - return bh_written == copied; + /* + * Update the in-memory inode size after copying the data into the page + * cache. It's up to the file system to write the updated size to disk, + * preferably after I/O completion so that no stale data is exposed. + * Only once that's done can we unlock and release the folio. + */ + if (pos + written > old_size) { + i_size_write(iter->inode, pos + written); + iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; } + __iomap_put_folio(iter, pos, written, folio); - return __iomap_write_end(iter->inode, pos, len, copied, folio); + if (old_size < pos) + pagecache_isize_extended(iter->inode, old_size, pos); + + return written == copied; } static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) @@ -907,7 +923,6 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) do { struct folio *folio; - loff_t old_size; size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ @@ -959,23 +974,6 @@ retry: written = iomap_write_end(iter, pos, bytes, copied, folio) ? copied : 0; - /* - * Update the in-memory inode size after copying the data into - * the page cache. It's up to the file system to write the - * updated size to disk, preferably after I/O completion so that - * no stale data is exposed. Only once that's done can we - * unlock and release the folio. - */ - old_size = iter->inode->i_size; - if (pos + written > old_size) { - i_size_write(iter->inode, pos + written); - iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; - } - __iomap_put_folio(iter, pos, written, folio); - - if (old_size < pos) - pagecache_isize_extended(iter->inode, old_size, pos); - cond_resched(); if (unlikely(written == 0)) { /* @@ -1346,7 +1344,6 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) bytes = folio_size(folio) - offset; ret = iomap_write_end(iter, pos, bytes, bytes, folio); - __iomap_put_folio(iter, pos, bytes, folio); if (WARN_ON_ONCE(!ret)) return -EIO; @@ -1412,7 +1409,6 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) folio_mark_accessed(folio); ret = iomap_write_end(iter, pos, bytes, bytes, folio); - __iomap_put_folio(iter, pos, bytes, folio); if (WARN_ON_ONCE(!ret)) return -EIO; diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 0fb7afac298e..9987055293b3 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -557,9 +557,11 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) size_check: if (EALIST_SIZE(ea_buf->xattr) != ea_size) { + int size = min_t(int, EALIST_SIZE(ea_buf->xattr), ea_size); + printk(KERN_ERR "ea_get: invalid extended attribute\n"); print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, - ea_buf->xattr, ea_size, 1); + ea_buf->xattr, size, 1); ea_release(inode, ea_buf); rc = -EIO; goto clean_up; diff --git a/fs/locks.c b/fs/locks.c index 90c8746874de..c360d1992d21 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2448,8 +2448,9 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, error = do_lock_file_wait(filp, cmd, file_lock); /* - * Attempt to detect a close/fcntl race and recover by releasing the - * lock that was just acquired. There is no need to do that when we're + * Detect close/fcntl races and recover by zapping all POSIX locks + * associated with this file and our files_struct, just like on + * filp_flush(). There is no need to do that when we're * unlocking though, or for OFD locks. */ if (!error && file_lock->c.flc_type != F_UNLCK && @@ -2464,9 +2465,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, f = files_lookup_fd_locked(files, fd); spin_unlock(&files->file_lock); if (f != filp) { - file_lock->c.flc_type = F_UNLCK; - error = do_lock_file_wait(filp, cmd, file_lock); - WARN_ON_ONCE(error); + locks_remove_posix(filp, files); error = -EBADF; } } diff --git a/fs/namei.c b/fs/namei.c index 37fb0a8aa09a..1e05a0f3f04d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3572,8 +3572,12 @@ static const char *open_last_lookups(struct nameidata *nd, else inode_lock_shared(dir->d_inode); dentry = lookup_open(nd, file, op, got_write); - if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED)) - fsnotify_create(dir->d_inode, dentry); + if (!IS_ERR(dentry)) { + if (file->f_mode & FMODE_CREATED) + fsnotify_create(dir->d_inode, dentry); + if (file->f_mode & FMODE_OPENED) + fsnotify_open(file); + } if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else @@ -3700,6 +3704,8 @@ int vfs_tmpfile(struct mnt_idmap *idmap, mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); + if (file->f_mode & FMODE_OPENED) + fsnotify_open(file); if (error) return error; /* Don't check for other permissions, the inode was just created */ diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 07bc1fd43530..d583af7a2209 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -523,6 +523,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr struct netfs_group *group; struct folio *folio = page_folio(vmf->page); struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; struct inode *inode = file_inode(file); struct netfs_inode *ictx = netfs_inode(inode); vm_fault_t ret = VM_FAULT_RETRY; @@ -534,6 +535,11 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr if (folio_lock_killable(folio) < 0) goto out; + if (folio->mapping != mapping) { + folio_unlock(folio); + ret = VM_FAULT_NOPAGE; + goto out; + } if (folio_wait_writeback_killable(folio)) { ret = VM_FAULT_LOCKED; @@ -549,9 +555,9 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr group = netfs_folio_group(folio); if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) { folio_unlock(folio); - err = filemap_fdatawait_range(inode->i_mapping, - folio_pos(folio), - folio_pos(folio) + folio_size(folio)); + err = filemap_fdatawrite_range(mapping, + folio_pos(folio), + folio_pos(folio) + folio_size(folio)); switch (err) { case 0: ret = VM_FAULT_RETRY; diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index e14cd53ac9fd..88f2adfab75e 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -92,8 +92,9 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); if (async) wreq->iocb = iocb; + wreq->len = iov_iter_count(&wreq->io_iter); wreq->cleanup = netfs_cleanup_dio_write; - ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), iov_iter_count(&wreq->io_iter)); + ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len); if (ret < 0) { _debug("begin = %zd", ret); goto out; diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 95e281a8af78..acd9ca14e264 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -63,15 +63,6 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} /* * misc.c */ -#define NETFS_FLAG_PUT_MARK BIT(0) -#define NETFS_FLAG_PAGECACHE_MARK BIT(1) -int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, - struct folio *folio, unsigned int flags, - gfp_t gfp_mask); -int netfs_add_folios_to_buffer(struct xarray *buffer, - struct address_space *mapping, - pgoff_t index, pgoff_t to, gfp_t gfp_mask); -void netfs_clear_buffer(struct xarray *buffer); /* * objects.c diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index bc1fc54fb724..83e644bd518f 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -8,87 +8,6 @@ #include <linux/swap.h> #include "internal.h" -/* - * Attach a folio to the buffer and maybe set marks on it to say that we need - * to put the folio later and twiddle the pagecache flags. - */ -int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, - struct folio *folio, unsigned int flags, - gfp_t gfp_mask) -{ - XA_STATE_ORDER(xas, xa, index, folio_order(folio)); - -retry: - xas_lock(&xas); - for (;;) { - xas_store(&xas, folio); - if (!xas_error(&xas)) - break; - xas_unlock(&xas); - if (!xas_nomem(&xas, gfp_mask)) - return xas_error(&xas); - goto retry; - } - - if (flags & NETFS_FLAG_PUT_MARK) - xas_set_mark(&xas, NETFS_BUF_PUT_MARK); - if (flags & NETFS_FLAG_PAGECACHE_MARK) - xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK); - xas_unlock(&xas); - return xas_error(&xas); -} - -/* - * Create the specified range of folios in the buffer attached to the read - * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that - * these need freeing later. - */ -int netfs_add_folios_to_buffer(struct xarray *buffer, - struct address_space *mapping, - pgoff_t index, pgoff_t to, gfp_t gfp_mask) -{ - struct folio *folio; - int ret; - - if (to + 1 == index) /* Page range is inclusive */ - return 0; - - do { - /* TODO: Figure out what order folio can be allocated here */ - folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0); - if (!folio) - return -ENOMEM; - folio->index = index; - ret = netfs_xa_store_and_mark(buffer, index, folio, - NETFS_FLAG_PUT_MARK, gfp_mask); - if (ret < 0) { - folio_put(folio); - return ret; - } - - index += folio_nr_pages(folio); - } while (index <= to && index != 0); - - return 0; -} - -/* - * Clear an xarray buffer, putting a ref on the folios that have - * NETFS_BUF_PUT_MARK set. - */ -void netfs_clear_buffer(struct xarray *buffer) -{ - struct folio *folio; - XA_STATE(xas, buffer, 0); - - rcu_read_lock(); - xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) { - folio_put(folio); - } - rcu_read_unlock(); - xa_destroy(buffer); -} - /** * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback * @mapping: The mapping the folio belongs to. diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 3aa86e268f40..ec6cf8707fb0 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -483,7 +483,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, if (!debug) kdebug("R=%x: No submit", wreq->debug_id); - if (flen < fsize) + if (foff + flen < fsize) for (int s = 0; s < NR_IO_STREAMS; s++) netfs_issue_write(wreq, &wreq->io_streams[s]); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 342930996226..07a7be27182e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1627,7 +1627,16 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, switch (error) { case 1: break; - case 0: + case -ETIMEDOUT: + if (inode && (IS_ROOT(dentry) || + NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)) + error = 1; + break; + case -ESTALE: + case -ENOENT: + error = 0; + fallthrough; + default: /* * We can't d_drop the root of a disconnected tree: * its d_hash is on the s_anon list and d_drop() would hide @@ -1682,18 +1691,8 @@ static int nfs_lookup_revalidate_dentry(struct inode *dir, dir_verifier = nfs_save_change_attribute(dir); ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); - if (ret < 0) { - switch (ret) { - case -ESTALE: - case -ENOENT: - ret = 0; - break; - case -ETIMEDOUT: - if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL) - ret = 1; - } + if (ret < 0) goto out; - } /* Request help from readdirplus */ nfs_lookup_advise_force_readdirplus(dir, flags); @@ -1737,7 +1736,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; - int error; + int error = 0; nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = d_inode(dentry); @@ -1782,7 +1781,7 @@ out_valid: out_bad: if (flags & LOOKUP_RCU) return -ECHILD; - return nfs_lookup_revalidate_done(dir, dentry, inode, 0); + return nfs_lookup_revalidate_done(dir, dentry, inode, error); } static int @@ -1804,9 +1803,10 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; } else { - /* Wait for unlink to complete */ + /* Wait for unlink to complete - see unblock_revalidate() */ wait_var_event(&dentry->d_fsdata, - dentry->d_fsdata != NFS_FSDATA_BLOCKED); + smp_load_acquire(&dentry->d_fsdata) + != NFS_FSDATA_BLOCKED); parent = dget_parent(dentry); ret = reval(d_inode(parent), dentry, flags); dput(parent); @@ -1819,6 +1819,29 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate); } +static void block_revalidate(struct dentry *dentry) +{ + /* old devname - just in case */ + kfree(dentry->d_fsdata); + + /* Any new reference that could lead to an open + * will take ->d_lock in lookup_open() -> d_lookup(). + * Holding this lock ensures we cannot race with + * __nfs_lookup_revalidate() and removes and need + * for further barriers. + */ + lockdep_assert_held(&dentry->d_lock); + + dentry->d_fsdata = NFS_FSDATA_BLOCKED; +} + +static void unblock_revalidate(struct dentry *dentry) +{ + /* store_release ensures wait_var_event() sees the update */ + smp_store_release(&dentry->d_fsdata, NULL); + wake_up_var(&dentry->d_fsdata); +} + /* * A weaker form of d_revalidate for revalidating just the d_inode(dentry) * when we don't really care about the dentry name. This is called when a @@ -2255,6 +2278,9 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, */ int error = 0; + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + return -ENAMETOOLONG; + if (open_flags & O_CREAT) { file->f_mode |= FMODE_CREATED; error = nfs_do_create(dir, dentry, mode, open_flags); @@ -2549,15 +2575,12 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); goto out; } - /* old devname */ - kfree(dentry->d_fsdata); - dentry->d_fsdata = NFS_FSDATA_BLOCKED; + block_revalidate(dentry); spin_unlock(&dentry->d_lock); error = nfs_safe_remove(dentry); nfs_dentry_remove_handle_error(dir, dentry, error); - dentry->d_fsdata = NULL; - wake_up_var(&dentry->d_fsdata); + unblock_revalidate(dentry); out: trace_nfs_unlink_exit(dir, dentry, error); return error; @@ -2664,8 +2687,7 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) { struct dentry *new_dentry = data->new_dentry; - new_dentry->d_fsdata = NULL; - wake_up_var(&new_dentry->d_fsdata); + unblock_revalidate(new_dentry); } /* @@ -2727,11 +2749,6 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) || WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED)) goto out; - if (new_dentry->d_fsdata) { - /* old devname */ - kfree(new_dentry->d_fsdata); - new_dentry->d_fsdata = NULL; - } spin_lock(&new_dentry->d_lock); if (d_count(new_dentry) > 2) { @@ -2753,7 +2770,7 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, new_dentry = dentry; new_inode = NULL; } else { - new_dentry->d_fsdata = NFS_FSDATA_BLOCKED; + block_revalidate(new_dentry); must_unblock = true; spin_unlock(&new_dentry->d_lock); } @@ -2765,6 +2782,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, must_unblock ? nfs_unblock_rename : NULL); if (IS_ERR(task)) { + if (must_unblock) + unblock_revalidate(new_dentry); error = PTR_ERR(task); goto out; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index bb2f583eb28b..90079ca134dd 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -141,8 +141,6 @@ int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) { ssize_t ret; - VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); - if (iov_iter_rw(iter) == READ) ret = nfs_file_direct_read(iocb, iter, true); else diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c93c12063b3a..a691fa10b3e9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4023,6 +4023,23 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location, } } +static bool _is_same_nfs4_pathname(struct nfs4_pathname *path1, + struct nfs4_pathname *path2) +{ + int i; + + if (path1->ncomponents != path2->ncomponents) + return false; + for (i = 0; i < path1->ncomponents; i++) { + if (path1->components[i].len != path2->components[i].len) + return false; + if (memcmp(path1->components[i].data, path2->components[i].data, + path1->components[i].len)) + return false; + } + return true; +} + static int _nfs4_discover_trunking(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -4056,9 +4073,13 @@ static int _nfs4_discover_trunking(struct nfs_server *server, if (status) goto out_free_3; - for (i = 0; i < locations->nlocations; i++) + for (i = 0; i < locations->nlocations; i++) { + if (!_is_same_nfs4_pathname(&locations->fs_path, + &locations->locations[i].rootpath)) + continue; test_fs_location_for_trunking(&locations->locations[i], clp, server); + } out_free_3: kfree(locations->fattr); out_free_2: @@ -6268,6 +6289,7 @@ nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) if (status == 0) nfs_setsecurity(inode, fattr); + nfs_free_fattr(fattr); return status; } #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6efb5068c116..040b6b79c75e 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1545,6 +1545,11 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) continue; } else if (index == prev->wb_index + 1) continue; + /* + * We will submit more requests after these. Indicate + * this to the underlying layers. + */ + desc->pg_moreio = 1; nfs_pageio_complete(desc); break; } diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 0e27a2e4e68b..13818129d268 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -41,7 +41,7 @@ static int nfs_symlink_filler(struct file *file, struct folio *folio) error: folio_set_error(folio); folio_unlock(folio); - return -EIO; + return error; } static const char *nfs_get_link(struct dentry *dentry, diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c index 62d2586d9902..529a75ecf22e 100644 --- a/fs/nfsd/netlink.c +++ b/fs/nfsd/netlink.c @@ -44,9 +44,7 @@ static const struct nla_policy nfsd_listener_set_nl_policy[NFSD_A_SERVER_SOCK_AD static const struct genl_split_ops nfsd_nl_ops[] = { { .cmd = NFSD_CMD_RPC_STATUS_GET, - .start = nfsd_nl_rpc_status_get_start, .dumpit = nfsd_nl_rpc_status_get_dumpit, - .done = nfsd_nl_rpc_status_get_done, .flags = GENL_CMD_CAP_DUMP, }, { diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h index e3724637d64d..2e132ef328f8 100644 --- a/fs/nfsd/netlink.h +++ b/fs/nfsd/netlink.h @@ -15,9 +15,6 @@ extern const struct nla_policy nfsd_sock_nl_policy[NFSD_A_SOCK_TRANSPORT_NAME + 1]; extern const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1]; -int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb); -int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb); - int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 202140df8f82..c848ebe5d08f 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1460,28 +1460,6 @@ static int create_proc_exports_entry(void) unsigned int nfsd_net_id; -/** - * nfsd_nl_rpc_status_get_start - Prepare rpc_status_get dumpit - * @cb: netlink metadata and command arguments - * - * Return values: - * %0: The rpc_status_get command may proceed - * %-ENODEV: There is no NFSD running in this namespace - */ -int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb) -{ - struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id); - int ret = -ENODEV; - - mutex_lock(&nfsd_mutex); - if (nn->nfsd_serv) - ret = 0; - else - mutex_unlock(&nfsd_mutex); - - return ret; -} - static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, struct netlink_callback *cb, struct nfsd_genl_rqstp *rqstp) @@ -1558,8 +1536,16 @@ static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id); int i, ret, rqstp_index = 0; + struct nfsd_net *nn; + + mutex_lock(&nfsd_mutex); + + nn = net_generic(sock_net(skb->sk), nfsd_net_id); + if (!nn->nfsd_serv) { + ret = -ENODEV; + goto out_unlock; + } rcu_read_lock(); @@ -1636,22 +1622,10 @@ int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, ret = skb->len; out: rcu_read_unlock(); - - return ret; -} - -/** - * nfsd_nl_rpc_status_get_done - rpc_status_get dumpit post-processing - * @cb: netlink metadata and command arguments - * - * Return values: - * %0: Success - */ -int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb) -{ +out_unlock: mutex_unlock(&nfsd_mutex); - return 0; + return ret; } /** @@ -2195,6 +2169,8 @@ static __net_init int nfsd_net_init(struct net *net) nn->nfsd_svcstats.program = &nfsd_program; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; + nn->nfsd_info.mutex = &nfsd_mutex; + nn->nfsd_serv = NULL; nfsd4_init_leases_net(nn); get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index cd9a6a1a9fc8..89d7918de7b1 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -672,7 +672,6 @@ int nfsd_create_serv(struct net *net) return error; } spin_lock(&nfsd_notifier_lock); - nn->nfsd_info.mutex = &nfsd_mutex; nn->nfsd_serv = serv; spin_unlock(&nfsd_notifier_lock); diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 89caef7513db..ba50388ee4bf 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -377,11 +377,12 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, * @target: offset number of an entry in the group (start point) * @bsize: size in bits * @lock: spin lock protecting @bitmap + * @wrap: whether to wrap around */ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, unsigned int bsize, - spinlock_t *lock) + spinlock_t *lock, bool wrap) { int pos, end = bsize; @@ -397,6 +398,8 @@ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, end = target; } + if (!wrap) + return -ENOSPC; /* wrap around */ for (pos = 0; pos < end; pos++) { @@ -495,9 +498,10 @@ int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp) * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation + * @wrap: whether to wrap around */ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, - struct nilfs_palloc_req *req) + struct nilfs_palloc_req *req, bool wrap) { struct buffer_head *desc_bh, *bitmap_bh; struct nilfs_palloc_group_desc *desc; @@ -516,7 +520,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, entries_per_group = nilfs_palloc_entries_per_group(inode); for (i = 0; i < ngroups; i += n) { - if (group >= ngroups) { + if (group >= ngroups && wrap) { /* wrap around */ group = 0; maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr, @@ -550,7 +554,14 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); pos = nilfs_palloc_find_available_slot( - bitmap, group_offset, entries_per_group, lock); + bitmap, group_offset, entries_per_group, lock, + wrap); + /* + * Since the search for a free slot in the second and + * subsequent bitmap blocks always starts from the + * beginning, the wrap flag only has an effect on the + * first search. + */ kunmap_local(bitmap_kaddr); if (pos >= 0) goto found; diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index b667e869ac07..d825a9faca6d 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -50,8 +50,8 @@ struct nilfs_palloc_req { struct buffer_head *pr_entry_bh; }; -int nilfs_palloc_prepare_alloc_entry(struct inode *, - struct nilfs_palloc_req *); +int nilfs_palloc_prepare_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req, bool wrap); void nilfs_palloc_commit_alloc_entry(struct inode *, struct nilfs_palloc_req *); void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *); diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 180fc8d36213..fc1caf63a42a 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -75,7 +75,7 @@ int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req) { int ret; - ret = nilfs_palloc_prepare_alloc_entry(dat, req); + ret = nilfs_palloc_prepare_alloc_entry(dat, req, true); if (ret < 0) return ret; diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index a002a44ff161..dddfa604491a 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -135,6 +135,9 @@ static bool nilfs_check_folio(struct folio *folio, char *kaddr) goto Enamelen; if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) goto Espan; + if (unlikely(p->inode && + NILFS_PRIVATE_INODE(le64_to_cpu(p->inode)))) + goto Einumber; } if (offs != limit) goto Eend; @@ -160,6 +163,9 @@ Enamelen: goto bad_entry; Espan: error = "directory entry across blocks"; + goto bad_entry; +Einumber: + error = "disallowed inode number"; bad_entry: nilfs_error(sb, "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d", @@ -607,7 +613,7 @@ int nilfs_empty_dir(struct inode *inode) kaddr = nilfs_get_folio(inode, i, &folio); if (IS_ERR(kaddr)) - continue; + return 0; de = (struct nilfs_dir_entry *)kaddr; kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1); diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 612e609158b5..1e86b9303b7c 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -56,13 +56,10 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, struct nilfs_palloc_req req; int ret; - req.pr_entry_nr = 0; /* - * 0 says find free inode from beginning - * of a group. dull code!! - */ + req.pr_entry_nr = NILFS_FIRST_INO(ifile->i_sb); req.pr_entry_bh = NULL; - ret = nilfs_palloc_prepare_alloc_entry(ifile, &req); + ret = nilfs_palloc_prepare_alloc_entry(ifile, &req, false); if (!ret) { ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1, &req.pr_entry_bh); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 728e90be3570..4017f7856440 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -116,9 +116,15 @@ enum { #define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino) #define NILFS_MDT_INODE(sb, ino) \ - ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & BIT(ino))) + ((ino) < NILFS_USER_INO && (NILFS_MDT_INO_BITS & BIT(ino))) #define NILFS_VALID_INODE(sb, ino) \ - ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & BIT(ino))) + ((ino) >= NILFS_FIRST_INO(sb) || \ + ((ino) < NILFS_USER_INO && (NILFS_SYS_INO_BITS & BIT(ino)))) + +#define NILFS_PRIVATE_INODE(ino) ({ \ + ino_t __ino = (ino); \ + ((__ino) < NILFS_USER_INO && (__ino) != NILFS_ROOT_INO && \ + (__ino) != NILFS_SKETCH_INO); }) /** * struct nilfs_transaction_info: context information for synchronization diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 60d4f59f7665..6ea81f1d5094 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1652,6 +1652,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) if (bh->b_folio != bd_folio) { if (bd_folio) { folio_lock(bd_folio); + folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); @@ -1665,6 +1666,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) if (bh == segbuf->sb_super_root) { if (bh->b_folio != bd_folio) { folio_lock(bd_folio); + folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); @@ -1681,6 +1683,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) } if (bd_folio) { folio_lock(bd_folio); + folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index f41d7b6d432c..e44dde57ab65 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -452,6 +452,12 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, } nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); + if (nilfs->ns_first_ino < NILFS_USER_INO) { + nilfs_err(nilfs->ns_sb, + "too small lower limit for non-reserved inode numbers: %u", + nilfs->ns_first_ino); + return -EINVAL; + } nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 85da0629415d..1e829ed7b0ef 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -182,7 +182,7 @@ struct the_nilfs { unsigned long ns_nrsvsegs; unsigned long ns_first_data_block; int ns_inode_size; - int ns_first_ino; + unsigned int ns_first_ino; u32 ns_crc_seed; /* /sys/fs/<nilfs>/<device> */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f0467d3b3c88..6be175a1ab3c 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2366,6 +2366,11 @@ static int ocfs2_dio_end_io_write(struct inode *inode, } list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { + ret = ocfs2_assure_trans_credits(handle, credits); + if (ret < 0) { + mlog_errno(ret); + break; + } ret = ocfs2_mark_extent_written(inode, &et, handle, ue->ue_cpos, 1, ue->ue_phys, diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 604fea3a26ff..530fba34f6d3 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -446,6 +446,23 @@ bail: } /* + * Make sure handle has at least 'nblocks' credits available. If it does not + * have that many credits available, we will try to extend the handle to have + * enough credits. If that fails, we will restart transaction to have enough + * credits. Similar notes regarding data consistency and locking implications + * as for ocfs2_extend_trans() apply here. + */ +int ocfs2_assure_trans_credits(handle_t *handle, int nblocks) +{ + int old_nblks = jbd2_handle_buffer_credits(handle); + + trace_ocfs2_assure_trans_credits(old_nblks); + if (old_nblks >= nblocks) + return 0; + return ocfs2_extend_trans(handle, nblocks - old_nblks); +} + +/* * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA. * If that fails, restart the transaction & regain write access for the * buffer head which is used for metadata modifications. @@ -479,12 +496,6 @@ bail: return status; } - -struct ocfs2_triggers { - struct jbd2_buffer_trigger_type ot_triggers; - int ot_offset; -}; - static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers) { return container_of(triggers, struct ocfs2_triggers, ot_triggers); @@ -548,85 +559,76 @@ static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh) { + struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers); + mlog(ML_ERROR, "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, " "bh->b_blocknr = %llu\n", (unsigned long)bh, (unsigned long long)bh->b_blocknr); - ocfs2_error(bh->b_assoc_map->host->i_sb, + ocfs2_error(ot->sb, "JBD2 has aborted our journal, ocfs2 cannot continue\n"); } -static struct ocfs2_triggers di_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dinode, i_check), -}; - -static struct ocfs2_triggers eb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_extent_block, h_check), -}; - -static struct ocfs2_triggers rb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), -}; - -static struct ocfs2_triggers gd_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), -}; - -static struct ocfs2_triggers db_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_db_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, -}; +static void ocfs2_setup_csum_triggers(struct super_block *sb, + enum ocfs2_journal_trigger_type type, + struct ocfs2_triggers *ot) +{ + BUG_ON(type >= OCFS2_JOURNAL_TRIGGER_COUNT); -static struct ocfs2_triggers xb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), -}; + switch (type) { + case OCFS2_JTR_DI: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dinode, i_check); + break; + case OCFS2_JTR_EB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_extent_block, h_check); + break; + case OCFS2_JTR_RB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_refcount_block, rf_check); + break; + case OCFS2_JTR_GD: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_group_desc, bg_check); + break; + case OCFS2_JTR_DB: + ot->ot_triggers.t_frozen = ocfs2_db_frozen_trigger; + break; + case OCFS2_JTR_XB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_xattr_block, xb_check); + break; + case OCFS2_JTR_DQ: + ot->ot_triggers.t_frozen = ocfs2_dq_frozen_trigger; + break; + case OCFS2_JTR_DR: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check); + break; + case OCFS2_JTR_DL: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check); + break; + case OCFS2_JTR_NONE: + /* To make compiler happy... */ + return; + } -static struct ocfs2_triggers dq_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_dq_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, -}; + ot->ot_triggers.t_abort = ocfs2_abort_trigger; + ot->sb = sb; +} -static struct ocfs2_triggers dr_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), -}; +void ocfs2_initialize_journal_triggers(struct super_block *sb, + struct ocfs2_triggers triggers[]) +{ + enum ocfs2_journal_trigger_type type; -static struct ocfs2_triggers dl_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), -}; + for (type = OCFS2_JTR_DI; type < OCFS2_JOURNAL_TRIGGER_COUNT; type++) + ocfs2_setup_csum_triggers(sb, type, &triggers[type]); +} static int __ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, @@ -708,56 +710,91 @@ static int __ocfs2_journal_access(handle_t *handle, int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DI], + type); } int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_EB], + type); } int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &rb_triggers, + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_RB], type); } int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_GD], + type); } int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DB], + type); } int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_XB], + type); } int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DQ], + type); } int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DR], + type); } int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DL], + type); } int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, @@ -778,13 +815,15 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) if (!is_handle_aborted(handle)) { journal_t *journal = handle->h_transaction->t_journal; - mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. " - "Aborting transaction and journal.\n"); + mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed: " + "handle type %u started at line %u, credits %u/%u " + "errcode %d. Aborting transaction and journal.\n", + handle->h_type, handle->h_line_no, + handle->h_requested_credits, + jbd2_handle_buffer_credits(handle), status); handle->h_err = status; jbd2_journal_abort_handle(handle); jbd2_journal_abort(journal, status); - ocfs2_abort(bh->b_assoc_map->host->i_sb, - "Journal already aborted.\n"); } } } diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 41c9fe7e62f9..e3c3a35dc5e0 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -243,6 +243,8 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int ocfs2_commit_trans(struct ocfs2_super *osb, handle_t *handle); int ocfs2_extend_trans(handle_t *handle, int nblocks); +int ocfs2_assure_trans_credits(handle_t *handle, + int nblocks); int ocfs2_allocate_extend_trans(handle_t *handle, int thresh); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index a503c553bab2..8fe826143d7b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -284,6 +284,30 @@ enum ocfs2_mount_options #define OCFS2_OSB_ERROR_FS 0x0004 #define OCFS2_DEFAULT_ATIME_QUANTUM 60 +struct ocfs2_triggers { + struct jbd2_buffer_trigger_type ot_triggers; + int ot_offset; + struct super_block *sb; +}; + +enum ocfs2_journal_trigger_type { + OCFS2_JTR_DI, + OCFS2_JTR_EB, + OCFS2_JTR_RB, + OCFS2_JTR_GD, + OCFS2_JTR_DB, + OCFS2_JTR_XB, + OCFS2_JTR_DQ, + OCFS2_JTR_DR, + OCFS2_JTR_DL, + OCFS2_JTR_NONE /* This must be the last entry */ +}; + +#define OCFS2_JOURNAL_TRIGGER_COUNT OCFS2_JTR_NONE + +void ocfs2_initialize_journal_triggers(struct super_block *sb, + struct ocfs2_triggers triggers[]); + struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; @@ -351,6 +375,9 @@ struct ocfs2_super struct ocfs2_journal *journal; unsigned long osb_commit_interval; + /* Journal triggers for checksum */ + struct ocfs2_triggers s_journal_triggers[OCFS2_JOURNAL_TRIGGER_COUNT]; + struct delayed_work la_enable_wq; /* diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 60e208b01c8d..0511c69c9fde 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -2577,6 +2577,8 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end); DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans); +DEFINE_OCFS2_INT_EVENT(ocfs2_assure_trans_credits); + DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart); DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8aabaed2c1cb..afee70125ae3 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1075,9 +1075,11 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root, osb, &ocfs2_osb_debug_fops); - if (ocfs2_meta_ecc(osb)) + if (ocfs2_meta_ecc(osb)) { + ocfs2_initialize_journal_triggers(sb, osb->s_journal_triggers); ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, osb->osb_debug_root); + } status = ocfs2_mount_volume(sb); if (status < 0) diff --git a/fs/open.c b/fs/open.c index 89cafb572061..278b3edcda44 100644 --- a/fs/open.c +++ b/fs/open.c @@ -202,13 +202,13 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) return error; } -SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) +SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length) { return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) +COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length) { return do_sys_ftruncate(fd, length, 1); } @@ -1004,11 +1004,6 @@ static int do_dentry_open(struct file *f, } } - /* - * Once we return a file with FMODE_OPENED, __fput() will call - * fsnotify_close(), so we need fsnotify_open() here for symmetry. - */ - fsnotify_open(f); return 0; cleanup_all: @@ -1085,8 +1080,19 @@ EXPORT_SYMBOL(file_path); */ int vfs_open(const struct path *path, struct file *file) { + int ret; + file->f_path = *path; - return do_dentry_open(file, NULL); + ret = do_dentry_open(file, NULL); + if (!ret) { + /* + * Once we return a file with FMODE_OPENED, __fput() will call + * fsnotify_close(), so we need fsnotify_open() here for + * symmetry. + */ + fsnotify_open(file); + } + return ret; } struct file *dentry_open(const struct path *path, int flags, @@ -1177,8 +1183,10 @@ struct file *kernel_file_open(const struct path *path, int flags, error = do_dentry_open(f, NULL); if (error) { fput(f); - f = ERR_PTR(error); + return ERR_PTR(error); } + + fsnotify_open(f); return f; } EXPORT_SYMBOL_GPL(kernel_file_open); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 116f542442dd..ab65e98a1def 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1314,10 +1314,6 @@ static int ovl_create_tmpfile(struct file *file, struct dentry *dentry, int flags = file->f_flags | OVL_OPEN_FLAGS; int err; - err = ovl_copy_up(dentry->d_parent); - if (err) - return err; - old_cred = ovl_override_creds(dentry->d_sb); err = ovl_setup_cred_for_create(dentry, inode, mode, old_cred); if (err) @@ -1360,6 +1356,10 @@ static int ovl_tmpfile(struct mnt_idmap *idmap, struct inode *dir, if (!OVL_FS(dentry->d_sb)->tmpfile) return -EOPNOTSUPP; + err = ovl_copy_up(dentry->d_parent); + if (err) + return err; + err = ovl_want_write(dentry); if (err) return err; diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 063409069f56..5868cb222955 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -181,6 +181,10 @@ static int ovl_check_encode_origin(struct dentry *dentry) struct ovl_fs *ofs = OVL_FS(dentry->d_sb); bool decodable = ofs->config.nfs_export; + /* No upper layer? */ + if (!ovl_upper_mnt(ofs)) + return 1; + /* Lower file handle for non-upper non-decodable */ if (!ovl_dentry_upper(dentry) && !decodable) return 1; @@ -209,7 +213,7 @@ static int ovl_check_encode_origin(struct dentry *dentry) * ovl_connect_layer() will try to make origin's layer "connected" by * copying up a "connectable" ancestor. */ - if (d_is_dir(dentry) && ovl_upper_mnt(ofs) && decodable) + if (d_is_dir(dentry) && decodable) return ovl_connect_layer(dentry); /* Lower file handle for indexed and non-upper dir/non-dir */ diff --git a/fs/proc/base.c b/fs/proc/base.c index 18550c071d71..72a1acd03675 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3214,7 +3214,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); - seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages); + seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm)); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f8d35f993fe5..71e5039d940d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -707,6 +707,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_X86_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif +#ifdef CONFIG_64BIT + [ilog2(VM_SEALED)] = "sl", +#endif }; size_t i; diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index bb86fc0641d8..6397fdefd876 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -134,7 +134,7 @@ module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); module_param(enable_gcm_256, bool, 0644); -MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: n/N/0"); +MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/0"); module_param(require_gcm_256, bool, 0644); MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0"); diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 73482734a8d8..557b68e99d0a 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1494,6 +1494,8 @@ struct cifs_aio_ctx { struct cifs_io_request { struct netfs_io_request rreq; struct cifsFileInfo *cfile; + struct TCP_Server_Info *server; + pid_t pid; }; /* asynchronous read support */ @@ -1504,7 +1506,6 @@ struct cifs_io_subrequest { struct cifs_io_request *req; }; ssize_t got_bytes; - pid_t pid; unsigned int xid; int result; bool have_xid; diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 25e9ab947c17..595c4b673707 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1345,8 +1345,8 @@ cifs_async_readv(struct cifs_io_subrequest *rdata) if (rc) return rc; - smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid); - smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16)); + smb->hdr.Pid = cpu_to_le16((__u16)rdata->req->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->req->pid >> 16)); smb->AndXCommand = 0xFF; /* none */ smb->Fid = rdata->req->cfile->fid.netfid; @@ -1689,8 +1689,8 @@ cifs_async_writev(struct cifs_io_subrequest *wdata) if (rc) goto async_writev_out; - smb->hdr.Pid = cpu_to_le16((__u16)wdata->pid); - smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16)); + smb->hdr.Pid = cpu_to_le16((__u16)wdata->req->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->req->pid >> 16)); smb->AndXCommand = 0xFF; /* none */ smb->Fid = wdata->req->cfile->fid.netfid; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 9d5c2440abfc..1374635e89fa 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -134,17 +134,15 @@ fail: static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; - struct TCP_Server_Info *server; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); + struct TCP_Server_Info *server = req->server; struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); size_t rsize = 0; int rc; rdata->xid = get_xid(); rdata->have_xid = true; - - server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); rdata->server = server; if (cifs_sb->ctx->rsize == 0) @@ -179,15 +177,8 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); - struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); - pid_t pid; int rc = 0; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) - pid = req->cfile->pid; - else - pid = current->tgid; // Ummm... This may be a workqueue - cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", __func__, rreq->debug_id, subreq->debug_index, rreq->mapping, subreq->transferred, subreq->len); @@ -201,16 +192,8 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) } __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - rdata->pid = pid; - - rc = adjust_credits(rdata->server, &rdata->credits, rdata->subreq.len); - if (!rc) { - if (rdata->req->cfile->invalidHandle) - rc = -EAGAIN; - else - rc = rdata->server->ops->async_readv(rdata); - } + rc = rdata->server->ops->async_readv(rdata); out: if (rc) netfs_subreq_terminated(subreq, rc, false); @@ -245,11 +228,15 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) rreq->rsize = cifs_sb->ctx->rsize; rreq->wsize = cifs_sb->ctx->wsize; + req->pid = current->tgid; // Ummm... This may be a workqueue if (file) { open_file = file->private_data; rreq->netfs_priv = file->private_data; req->cfile = cifsFileInfo_get(open_file); + req->server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + req->pid = req->cfile->pid; } else if (rreq->origin != NETFS_WRITEBACK) { WARN_ON_ONCE(1); return -EIO; @@ -259,35 +246,6 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) } /* - * Expand the size of a readahead to the size of the rsize, if at least as - * large as a page, allowing for the possibility that rsize is not pow-2 - * aligned. - */ -static void cifs_expand_readahead(struct netfs_io_request *rreq) -{ - unsigned int rsize = rreq->rsize; - loff_t misalignment, i_size = i_size_read(rreq->inode); - - if (rsize < PAGE_SIZE) - return; - - if (rsize < INT_MAX) - rsize = roundup_pow_of_two(rsize); - else - rsize = ((unsigned int)INT_MAX + 1) / 2; - - misalignment = rreq->start & (rsize - 1); - if (misalignment) { - rreq->start -= misalignment; - rreq->len += misalignment; - } - - rreq->len = round_up(rreq->len, rsize); - if (rreq->start < i_size && rreq->len > i_size - rreq->start) - rreq->len = i_size - rreq->start; -} - -/* * Completion of a request operation. */ static void cifs_rreq_done(struct netfs_io_request *rreq) @@ -342,7 +300,6 @@ const struct netfs_request_ops cifs_req_ops = { .init_request = cifs_init_request, .free_request = cifs_free_request, .free_subrequest = cifs_free_subrequest, - .expand_readahead = cifs_expand_readahead, .clamp_length = cifs_clamp_length, .issue_read = cifs_req_issue_read, .done = cifs_rreq_done, @@ -3200,8 +3157,6 @@ static int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) { ssize_t ret; - WARN_ON_ONCE(iov_iter_count(iter) != PAGE_SIZE); - if (iov_iter_rw(iter) == READ) ret = netfs_unbuffered_read_iter_locked(iocb, iter); else diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 993ac36c3d58..2ae2dbb6202b 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4484,6 +4484,16 @@ smb2_new_read_req(void **buf, unsigned int *total_len, return rc; } +static void smb2_readv_worker(struct work_struct *work) +{ + struct cifs_io_subrequest *rdata = + container_of(work, struct cifs_io_subrequest, subreq.work); + + netfs_subreq_terminated(&rdata->subreq, + (rdata->result == 0 || rdata->result == -EAGAIN) ? + rdata->got_bytes : rdata->result, true); +} + static void smb2_readv_callback(struct mid_q_entry *mid) { @@ -4577,12 +4587,9 @@ smb2_readv_callback(struct mid_q_entry *mid) if (rdata->subreq.start < rdata->subreq.rreq->i_size) rdata->result = 0; } - if (rdata->result == 0 || rdata->result == -EAGAIN) - iov_iter_advance(&rdata->subreq.io_iter, rdata->got_bytes); rdata->credits.value = 0; - netfs_subreq_terminated(&rdata->subreq, - (rdata->result == 0 || rdata->result == -EAGAIN) ? - rdata->got_bytes : rdata->result, true); + INIT_WORK(&rdata->subreq.work, smb2_readv_worker); + queue_work(cifsiod_wq, &rdata->subreq.work); release_mid(mid); add_credits(server, &credits, 0); } @@ -4614,7 +4621,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) io_parms.length = rdata->subreq.len; io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid; io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid; - io_parms.pid = rdata->pid; + io_parms.pid = rdata->req->pid; rc = smb2_new_read_req( (void **) &buf, &total_len, &io_parms, rdata, 0, 0); @@ -4789,7 +4796,6 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->result = -ENOSPC; else wdata->subreq.len = written; - iov_iter_advance(&wdata->subreq.io_iter, written); break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: @@ -4867,7 +4873,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) .length = wdata->subreq.len, .persistent_fid = wdata->req->cfile->fid.persistent_fid, .volatile_fid = wdata->req->cfile->fid.volatile_fid, - .pid = wdata->pid, + .pid = wdata->req->pid, }; io_parms = &_io_parms; diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 02135a605305..1476c445cadc 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -216,8 +216,8 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid) } tcon = smb2_find_smb_sess_tcon_unlocked(ses, tid); if (!tcon) { - cifs_put_smb_ses(ses); spin_unlock(&cifs_tcp_ses_lock); + cifs_put_smb_ses(ses); return NULL; } spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index b6c5a8ea3887..e7e07891781b 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -630,6 +630,12 @@ smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls) return name; } + if (*name == '\\') { + pr_err("not allow directory name included leading slash\n"); + kfree(name); + return ERR_PTR(-EINVAL); + } + ksmbd_conv_path_to_unix(name); ksmbd_strip_last_slash(name); return name; @@ -2361,7 +2367,8 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, if (rc > 0) { rc = ksmbd_vfs_remove_xattr(idmap, path, - attr_name); + attr_name, + get_write); if (rc < 0) { ksmbd_debug(SMB, @@ -2376,7 +2383,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, } else { rc = ksmbd_vfs_setxattr(idmap, path, attr_name, value, le16_to_cpu(eabuf->EaValueLength), - 0, true); + 0, get_write); if (rc < 0) { ksmbd_debug(SMB, "ksmbd_vfs_setxattr is failed(%d)\n", @@ -2468,7 +2475,7 @@ static int smb2_remove_smb_xattrs(const struct path *path) !strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, STREAM_PREFIX_LEN)) { err = ksmbd_vfs_remove_xattr(idmap, path, - name); + name, true); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", name); @@ -2842,20 +2849,11 @@ int smb2_open(struct ksmbd_work *work) } if (req->NameLength) { - if ((req->CreateOptions & FILE_DIRECTORY_FILE_LE) && - *(char *)req->Buffer == '\\') { - pr_err("not allow directory name included leading slash\n"); - rc = -EINVAL; - goto err_out2; - } - name = smb2_get_name((char *)req + le16_to_cpu(req->NameOffset), le16_to_cpu(req->NameLength), work->conn->local_nls); if (IS_ERR(name)) { rc = PTR_ERR(name); - if (rc != -ENOMEM) - rc = -ENOENT; name = NULL; goto err_out2; } diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 51b1b0bed616..9e859ba010cf 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -1058,16 +1058,21 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, } int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, - const struct path *path, char *attr_name) + const struct path *path, char *attr_name, + bool get_write) { int err; - err = mnt_want_write(path->mnt); - if (err) - return err; + if (get_write == true) { + err = mnt_want_write(path->mnt); + if (err) + return err; + } err = vfs_removexattr(idmap, path->dentry, attr_name); - mnt_drop_write(path->mnt); + + if (get_write == true) + mnt_drop_write(path->mnt); return err; } @@ -1380,7 +1385,7 @@ int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path) ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) { - err = ksmbd_vfs_remove_xattr(idmap, path, name); + err = ksmbd_vfs_remove_xattr(idmap, path, name, true); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", name); } diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index cfe1c8092f23..cb76f4b5bafe 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -114,7 +114,8 @@ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, - const struct path *path, char *attr_name); + const struct path *path, char *attr_name, + bool get_write); int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, unsigned int flags, struct path *parent_path, struct path *path, bool caseless); diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 6cb599cd287e..8b2e37c8716e 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -254,7 +254,8 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) ci->m_flags &= ~S_DEL_ON_CLS_STREAM; err = ksmbd_vfs_remove_xattr(file_mnt_idmap(filp), &filp->f_path, - fp->stream.name); + fp->stream.name, + true); if (err) pr_err("remove xattr failed : %s\n", fp->stream.name); diff --git a/fs/super.c b/fs/super.c index b72f1d288e95..095ba793e10c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1502,8 +1502,17 @@ static int fs_bdev_thaw(struct block_device *bdev) lockdep_assert_held(&bdev->bd_fsfreeze_mutex); + /* + * The block device may have been frozen before it was claimed by a + * filesystem. Concurrently another process might try to mount that + * frozen block device and has temporarily claimed the block device for + * that purpose causing a concurrent fs_bdev_thaw() to end up here. The + * mounter is already about to abort mounting because they still saw an + * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return + * NULL in that case. + */ sb = get_bdev_super(bdev); - if (WARN_ON_ONCE(!sb)) + if (!sb) return -EINVAL; if (sb->s_op->thaw_super) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c101cf266bc4..6af6f744fdd6 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4058,20 +4058,32 @@ xfs_bmapi_reserve_delalloc( xfs_extlen_t indlen; uint64_t fdblocks; int error; - xfs_fileoff_t aoff = off; + xfs_fileoff_t aoff; + bool use_cowextszhint = + whichfork == XFS_COW_FORK && !prealloc; +retry: /* * Cap the alloc length. Keep track of prealloc so we know whether to * tag the inode before we return. */ + aoff = off; alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); if (prealloc && alen >= len) prealloc = alen - len; - /* Figure out the extent size, adjust alen */ - if (whichfork == XFS_COW_FORK) { + /* + * If we're targetting the COW fork but aren't creating a speculative + * posteof preallocation, try to expand the reservation to align with + * the COW extent size hint if there's sufficient free space. + * + * Unlike the data fork, the CoW cancellation functions will free all + * the reservations at inactivation, so we don't require that every + * delalloc reservation have a dirty pagecache. + */ + if (use_cowextszhint) { struct xfs_bmbt_irec prev; xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); @@ -4090,7 +4102,7 @@ xfs_bmapi_reserve_delalloc( */ error = xfs_quota_reserve_blkres(ip, alen); if (error) - return error; + goto out; /* * Split changing sb for alen and indlen since they could be coming @@ -4140,6 +4152,17 @@ out_unreserve_frextents: out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) xfs_quota_unreserve_blkres(ip, alen); +out: + if (error == -ENOSPC || error == -EDQUOT) { + trace_xfs_delalloc_enospc(ip, off, len); + + if (prealloc || use_cowextszhint) { + /* retry without any preallocation */ + use_cowextszhint = false; + prealloc = 0; + goto retry; + } + } return error; } diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 97996cb79aaa..454b63ef7201 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -996,7 +996,7 @@ struct xfs_getparents_by_handle { #define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom) #define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req) #define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req) -#define XFS_IOC_EXCHANGE_RANGE _IOWR('X', 129, struct xfs_exchange_range) +#define XFS_IOC_EXCHANGE_RANGE _IOW ('X', 129, struct xfs_exchange_range) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index e7a7bfbe75b4..513b50da6215 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -379,10 +379,13 @@ xfs_dinode_verify_fork( /* * A directory small enough to fit in the inode must be stored * in local format. The directory sf <-> extents conversion - * code updates the directory size accordingly. + * code updates the directory size accordingly. Directories + * being truncated have zero size and are not subject to this + * check. */ if (S_ISDIR(mode)) { - if (be64_to_cpu(dip->di_size) <= fork_size && + if (dip->di_size && + be64_to_cpu(dip->di_size) <= fork_size && fork_format != XFS_DINODE_FMT_LOCAL) return __this_address; } @@ -528,9 +531,19 @@ xfs_dinode_verify( if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN) return __this_address; - /* No zero-length symlinks/dirs. */ - if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) - return __this_address; + /* + * No zero-length symlinks/dirs unless they're unlinked and hence being + * inactivated. + */ + if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) { + if (dip->di_version > 1) { + if (dip->di_nlink) + return __this_address; + } else { + if (dip->di_onlink) + return __this_address; + } + } fa = xfs_dinode_verify_nrext64(mp, dip); if (fa) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 09e4bf949bf8..6b56f0f6d4c1 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1038,11 +1038,12 @@ xfs_log_sb( * and hence we don't need have to update it here. */ if (xfs_has_lazysbcount(mp)) { - mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount); mp->m_sb.sb_ifree = min_t(uint64_t, - percpu_counter_sum(&mp->m_ifree), + percpu_counter_sum_positive(&mp->m_ifree), mp->m_sb.sb_icount); - mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + mp->m_sb.sb_fdblocks = + percpu_counter_sum_positive(&mp->m_fdblocks); } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index ac2e77ebb54c..a4d9fbc21b83 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -486,13 +486,11 @@ out_unlock: /* * Test whether it is appropriate to check an inode for and free post EOF - * blocks. The 'force' parameter determines whether we should also consider - * regular files that are marked preallocated or append-only. + * blocks. */ bool xfs_can_free_eofblocks( - struct xfs_inode *ip, - bool force) + struct xfs_inode *ip) { struct xfs_bmbt_irec imap; struct xfs_mount *mp = ip->i_mount; @@ -526,11 +524,11 @@ xfs_can_free_eofblocks( return false; /* - * Do not free real preallocated or append-only files unless the file - * has delalloc blocks and we are forced to remove them. + * Only free real extents for inodes with persistent preallocations or + * the append-only flag. */ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) - if (!force || ip->i_delayed_blks == 0) + if (ip->i_delayed_blks == 0) return false; /* @@ -584,6 +582,22 @@ xfs_free_eofblocks( /* Wait on dio to ensure i_size has settled. */ inode_dio_wait(VFS_I(ip)); + /* + * For preallocated files only free delayed allocations. + * + * Note that this means we also leave speculative preallocations in + * place for preallocated files. + */ + if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) { + if (ip->i_delayed_blks) { + xfs_bmap_punch_delalloc_range(ip, + round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), + LLONG_MAX); + } + xfs_inode_clear_eofblocks_tag(ip); + return 0; + } + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { ASSERT(xfs_is_shutdown(mp)); @@ -891,7 +905,7 @@ xfs_prepare_shift( * Trim eofblocks to avoid shifting uninitialized post-eof preallocation * into the accessible region of the file. */ - if (xfs_can_free_eofblocks(ip, true)) { + if (xfs_can_free_eofblocks(ip)) { error = xfs_free_eofblocks(ip); if (error) return error; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 51f84d8ff372..eb0895bfb9da 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -63,7 +63,7 @@ int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, xfs_off_t len); /* EOF block manipulation functions */ -bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); +bool xfs_can_free_eofblocks(struct xfs_inode *ip); int xfs_free_eofblocks(struct xfs_inode *ip); int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0953163a2d84..9967334ea99f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1155,7 +1155,7 @@ xfs_inode_free_eofblocks( } *lockflags |= XFS_IOLOCK_EXCL; - if (xfs_can_free_eofblocks(ip, false)) + if (xfs_can_free_eofblocks(ip)) return xfs_free_eofblocks(ip); /* inode could be preallocated or append-only */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 58fb7a5062e1..a4e3cd8971fc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -42,6 +42,7 @@ #include "xfs_pnfs.h" #include "xfs_parent.h" #include "xfs_xattr.h" +#include "xfs_sb.h" struct kmem_cache *xfs_inode_cache; @@ -870,9 +871,16 @@ xfs_init_new_inode( * this saves us from needing to run a separate transaction to set the * fork offset in the immediate future. */ - if (init_xattrs && xfs_has_attr(mp)) { + if (init_xattrs) { ip->i_forkoff = xfs_default_attroffset(ip) >> 3; xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); + + if (!xfs_has_attr(mp)) { + spin_lock(&mp->m_sb_lock); + xfs_add_attr(mp); + spin_unlock(&mp->m_sb_lock); + xfs_log_sb(tp); + } } /* @@ -1595,7 +1603,7 @@ xfs_release( if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) return 0; - if (xfs_can_free_eofblocks(ip, false)) { + if (xfs_can_free_eofblocks(ip)) { /* * Check if the inode is being opened, written and closed * frequently and we have delayed allocation blocks outstanding @@ -1856,15 +1864,13 @@ xfs_inode_needs_inactive( /* * This file isn't being freed, so check if there are post-eof blocks - * to free. @force is true because we are evicting an inode from the - * cache. Post-eof blocks must be freed, lest we end up with broken - * free space accounting. + * to free. * * Note: don't bother with iolock here since lockdep complains about * acquiring it in reclaim context. We have the only reference to the * inode at this point anyways. */ - return xfs_can_free_eofblocks(ip, true); + return xfs_can_free_eofblocks(ip); } /* @@ -1947,15 +1953,11 @@ xfs_inactive( if (VFS_I(ip)->i_nlink != 0) { /* - * force is true because we are evicting an inode from the - * cache. Post-eof blocks must be freed, lest we end up with - * broken free space accounting. - * * Note: don't bother with iolock here since lockdep complains * about acquiring it in reclaim context. We have the only * reference to the inode at this point anyways. */ - if (xfs_can_free_eofblocks(ip, true)) + if (xfs_can_free_eofblocks(ip)) error = xfs_free_eofblocks(ip); goto out; @@ -2548,11 +2550,26 @@ xfs_ifree_cluster( * This buffer may not have been correctly initialised as we * didn't read it from disk. That's not important because we are * only using to mark the buffer as stale in the log, and to - * attach stale cached inodes on it. That means it will never be - * dispatched for IO. If it is, we want to know about it, and we - * want it to fail. We can acheive this by adding a write - * verifier to the buffer. + * attach stale cached inodes on it. + * + * For the inode that triggered the cluster freeing, this + * attachment may occur in xfs_inode_item_precommit() after we + * have marked this buffer stale. If this buffer was not in + * memory before xfs_ifree_cluster() started, it will not be + * marked XBF_DONE and this will cause problems later in + * xfs_inode_item_precommit() when we trip over a (stale, !done) + * buffer to attached to the transaction. + * + * Hence we have to mark the buffer as XFS_DONE here. This is + * safe because we are also marking the buffer as XBF_STALE and + * XFS_BLI_STALE. That means it will never be dispatched for + * IO and it won't be unlocked until the cluster freeing has + * been committed to the journal and the buffer unpinned. If it + * is written, we want to know about it, and we want it to + * fail. We can acheive this by adding a write verifier to the + * buffer. */ + bp->b_flags |= XBF_DONE; bp->b_ops = &xfs_inode_buf_ops; /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 378342673925..414903885ab9 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1148,33 +1148,23 @@ xfs_buffered_write_iomap_begin( } } -retry: - error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, - end_fsb - offset_fsb, prealloc_blocks, - allocfork == XFS_DATA_FORK ? &imap : &cmap, - allocfork == XFS_DATA_FORK ? &icur : &ccur, - allocfork == XFS_DATA_FORK ? eof : cow_eof); - switch (error) { - case 0: - break; - case -ENOSPC: - case -EDQUOT: - /* retry without any preallocation */ - trace_xfs_delalloc_enospc(ip, offset, count); - if (prealloc_blocks) { - prealloc_blocks = 0; - goto retry; - } - fallthrough; - default: - goto out_unlock; - } - if (allocfork == XFS_COW_FORK) { + error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, + end_fsb - offset_fsb, prealloc_blocks, &cmap, + &ccur, cow_eof); + if (error) + goto out_unlock; + trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap); goto found_cow; } + error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, + end_fsb - offset_fsb, prealloc_blocks, &imap, &icur, + eof); + if (error) + goto out_unlock; + /* * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch * them out if the write happens to fail. |