summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2024-07-12 18:24:12 +0300
committerPaolo Bonzini <pbonzini@redhat.com>2024-07-12 18:24:12 +0300
commitc8b8b8190a80b591aa73c27c70a668799f8db547 (patch)
tree9d948c9aac89678abe64ac81f6c43348bf4b2091 /fs
parentf0a23883fad4ec8a63faddb9639a92be2e007624 (diff)
parent492ac37fa38faf520b5beae44c930063265ee183 (diff)
downloadlinux-c8b8b8190a80b591aa73c27c70a668799f8db547.tar.xz
Merge tag 'loongarch-kvm-6.11' of git://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson into HEAD
LoongArch KVM changes for v6.11 1. Add ParaVirt steal time support. 2. Add some VM migration enhancement. 3. Add perf kvm-stat support for loongarch.
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/inode.c4
-rw-r--r--fs/bcachefs/alloc_background.c311
-rw-r--r--fs/bcachefs/alloc_background.h14
-rw-r--r--fs/bcachefs/alloc_foreground.c4
-rw-r--r--fs/bcachefs/bcachefs.h24
-rw-r--r--fs/bcachefs/bcachefs_format.h13
-rw-r--r--fs/bcachefs/bkey.c2
-rw-r--r--fs/bcachefs/bkey_methods.c6
-rw-r--r--fs/bcachefs/bkey_methods.h3
-rw-r--r--fs/bcachefs/btree_cache.c9
-rw-r--r--fs/bcachefs/btree_gc.c17
-rw-r--r--fs/bcachefs/btree_io.c8
-rw-r--r--fs/bcachefs/btree_iter.c35
-rw-r--r--fs/bcachefs/btree_key_cache.c33
-rw-r--r--fs/bcachefs/btree_locking.c1
-rw-r--r--fs/bcachefs/btree_node_scan.c9
-rw-r--r--fs/bcachefs/btree_types.h16
-rw-r--r--fs/bcachefs/buckets.c293
-rw-r--r--fs/bcachefs/buckets.h17
-rw-r--r--fs/bcachefs/buckets_types.h2
-rw-r--r--fs/bcachefs/chardev.c9
-rw-r--r--fs/bcachefs/data_update.c3
-rw-r--r--fs/bcachefs/debug.c109
-rw-r--r--fs/bcachefs/ec.c26
-rw-r--r--fs/bcachefs/errcode.h3
-rw-r--r--fs/bcachefs/error.c19
-rw-r--r--fs/bcachefs/error.h7
-rw-r--r--fs/bcachefs/extents.c9
-rw-r--r--fs/bcachefs/fs-ioctl.c19
-rw-r--r--fs/bcachefs/fs.c24
-rw-r--r--fs/bcachefs/fsck.c3
-rw-r--r--fs/bcachefs/io_read.c37
-rw-r--r--fs/bcachefs/io_write.c19
-rw-r--r--fs/bcachefs/journal.c8
-rw-r--r--fs/bcachefs/journal_io.c20
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c2
-rw-r--r--fs/bcachefs/lru.h3
-rw-r--r--fs/bcachefs/move.c16
-rw-r--r--fs/bcachefs/movinggc.c7
-rw-r--r--fs/bcachefs/opts.h2
-rw-r--r--fs/bcachefs/recovery.c12
-rw-r--r--fs/bcachefs/sb-downgrade.c2
-rw-r--r--fs/bcachefs/sb-errors.c14
-rw-r--r--fs/bcachefs/sb-errors_format.h559
-rw-r--r--fs/bcachefs/seqmutex.h11
-rw-r--r--fs/bcachefs/snapshot.c12
-rw-r--r--fs/bcachefs/str_hash.h2
-rw-r--r--fs/bcachefs/super-io.c13
-rw-r--r--fs/bcachefs/super.c29
-rw-r--r--fs/btrfs/bio.c4
-rw-r--r--fs/btrfs/block-group.c20
-rw-r--r--fs/btrfs/btrfs_inode.h10
-rw-r--r--fs/btrfs/disk-io.c10
-rw-r--r--fs/btrfs/extent_io.c62
-rw-r--r--fs/btrfs/file.c16
-rw-r--r--fs/btrfs/free-space-cache.c2
-rw-r--r--fs/btrfs/inode.c2
-rw-r--r--fs/btrfs/ordered-data.c31
-rw-r--r--fs/btrfs/qgroup.c14
-rw-r--r--fs/btrfs/ref-verify.c9
-rw-r--r--fs/btrfs/scrub.c24
-rw-r--r--fs/btrfs/space-info.c24
-rw-r--r--fs/btrfs/tree-log.c60
-rw-r--r--fs/cachefiles/daemon.c3
-rw-r--r--fs/cachefiles/internal.h5
-rw-r--r--fs/cachefiles/ondemand.c218
-rw-r--r--fs/dcache.c31
-rw-r--r--fs/debugfs/inode.c10
-rw-r--r--fs/erofs/super.c2
-rw-r--r--fs/erofs/zmap.c2
-rw-r--r--fs/erofs/zutil.c8
-rw-r--r--fs/file.c4
-rw-r--r--fs/iomap/buffered-io.c56
-rw-r--r--fs/jfs/xattr.c4
-rw-r--r--fs/locks.c9
-rw-r--r--fs/namei.c10
-rw-r--r--fs/netfs/buffered_write.c12
-rw-r--r--fs/netfs/direct_write.c3
-rw-r--r--fs/netfs/internal.h9
-rw-r--r--fs/netfs/misc.c81
-rw-r--r--fs/netfs/write_issue.c2
-rw-r--r--fs/nfs/dir.c77
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/nfs4proc.c24
-rw-r--r--fs/nfs/pagelist.c5
-rw-r--r--fs/nfs/symlink.c2
-rw-r--r--fs/nfsd/netlink.c2
-rw-r--r--fs/nfsd/netlink.h3
-rw-r--r--fs/nfsd/nfsctl.c50
-rw-r--r--fs/nfsd/nfssvc.c1
-rw-r--r--fs/nilfs2/alloc.c19
-rw-r--r--fs/nilfs2/alloc.h4
-rw-r--r--fs/nilfs2/dat.c2
-rw-r--r--fs/nilfs2/dir.c8
-rw-r--r--fs/nilfs2/ifile.c7
-rw-r--r--fs/nilfs2/nilfs.h10
-rw-r--r--fs/nilfs2/segment.c3
-rw-r--r--fs/nilfs2/the_nilfs.c6
-rw-r--r--fs/nilfs2/the_nilfs.h2
-rw-r--r--fs/ocfs2/aops.c5
-rw-r--r--fs/ocfs2/journal.c209
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/ocfs2.h27
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/super.c4
-rw-r--r--fs/open.c26
-rw-r--r--fs/overlayfs/dir.c8
-rw-r--r--fs/overlayfs/export.c6
-rw-r--r--fs/proc/base.c2
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/smb/client/cifsfs.c2
-rw-r--r--fs/smb/client/cifsglob.h3
-rw-r--r--fs/smb/client/cifssmb.c8
-rw-r--r--fs/smb/client/file.c57
-rw-r--r--fs/smb/client/smb2pdu.c22
-rw-r--r--fs/smb/client/smb2transport.c2
-rw-r--r--fs/smb/server/smb2pdu.c22
-rw-r--r--fs/smb/server/vfs.c17
-rw-r--r--fs/smb/server/vfs.h3
-rw-r--r--fs/smb/server/vfs_cache.c3
-rw-r--r--fs/super.c11
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c31
-rw-r--r--fs/xfs/libxfs/xfs_fs.h2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c23
-rw-r--r--fs/xfs/libxfs/xfs_sb.c7
-rw-r--r--fs/xfs/xfs_bmap_util.c30
-rw-r--r--fs/xfs/xfs_bmap_util.h2
-rw-r--r--fs/xfs/xfs_icache.c2
-rw-r--r--fs/xfs/xfs_inode.c47
-rw-r--r--fs/xfs/xfs_iomap.c34
130 files changed, 2038 insertions, 1388 deletions
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 15bb7989c387..3acf5e050072 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -512,7 +512,7 @@ static int afs_iget5_set_root(struct inode *inode, void *opaque)
struct afs_vnode *vnode = AFS_FS_I(inode);
vnode->volume = as->volume;
- vnode->fid.vid = as->volume->vid,
+ vnode->fid.vid = as->volume->vid;
vnode->fid.vnode = 1;
vnode->fid.unique = 1;
inode->i_ino = 1;
@@ -545,7 +545,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
BUG_ON(!(inode->i_state & I_NEW));
vnode = AFS_FS_I(inode);
- vnode->cb_v_check = atomic_read(&as->volume->cb_v_break),
+ vnode->cb_v_check = atomic_read(&as->volume->cb_v_break);
afs_set_netfs_context(vnode);
op = afs_alloc_operation(key, as->volume);
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 346cd91f91f9..1de9fac3bcf4 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -29,7 +29,7 @@
#include <linux/sched/task.h>
#include <linux/sort.h>
-static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket);
+static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
/* Persistent alloc info: */
@@ -259,6 +259,14 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
"invalid data type (got %u should be %u)",
a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
+ for (unsigned i = 0; i < 2; i++)
+ bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX,
+ c, err,
+ alloc_key_io_time_bad,
+ "invalid io_time[%s]: %llu, max %llu",
+ i == READ ? "read" : "write",
+ a.v->io_time[i], LRU_TIME_MAX);
+
switch (a.v->data_type) {
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
@@ -741,6 +749,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
@@ -756,8 +765,8 @@ int bch2_trigger_alloc(struct btree_trans *trans,
alloc_data_type_set(new_a, new_a->data_type);
if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
- new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
- new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
+ new_a->io_time[READ] = bch2_current_io_time(c, READ);
+ new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE);
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
}
@@ -767,6 +776,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
!bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
new_a->gen++;
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+ alloc_data_type_set(new_a, new_a->data_type);
}
if (old_a->data_type != new_a->data_type ||
@@ -780,7 +790,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if (new_a->data_type == BCH_DATA_cached &&
!new_a->io_time[READ])
- new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+ new_a->io_time[READ] = bch2_current_io_time(c, READ);
u64 old_lru = alloc_lru_idx_read(*old_a);
u64 new_lru = alloc_lru_idx_read(*new_a);
@@ -860,8 +870,14 @@ int bch2_trigger_alloc(struct btree_trans *trans,
}
percpu_down_read(&c->mark_lock);
- if (new_a->gen != old_a->gen)
- *bucket_gen(ca, new.k->p.offset) = new_a->gen;
+ if (new_a->gen != old_a->gen) {
+ u8 *gen = bucket_gen(ca, new.k->p.offset);
+ if (unlikely(!gen)) {
+ percpu_up_read(&c->mark_lock);
+ goto invalid_bucket;
+ }
+ *gen = new_a->gen;
+ }
bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
percpu_up_read(&c->mark_lock);
@@ -875,14 +891,14 @@ int bch2_trigger_alloc(struct btree_trans *trans,
closure_wake_up(&c->freelist_wait);
if (statechange(a->data_type == BCH_DATA_need_discard) &&
- !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
+ !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
bucket_flushed(new_a))
- bch2_discard_one_bucket_fast(c, new.k->p);
+ bch2_discard_one_bucket_fast(ca, new.k->p.offset);
if (statechange(a->data_type == BCH_DATA_cached) &&
!bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
- bch2_do_invalidates(c);
+ bch2_dev_do_invalidates(ca);
if (statechange(a->data_type == BCH_DATA_need_gc_gens))
bch2_gc_gens_async(c);
@@ -895,6 +911,11 @@ int bch2_trigger_alloc(struct btree_trans *trans,
percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, new.k->p.offset);
+ if (unlikely(!g)) {
+ percpu_up_read(&c->mark_lock);
+ goto invalid_bucket;
+ }
+ g->gen_valid = 1;
bucket_lock(g);
@@ -910,8 +931,14 @@ int bch2_trigger_alloc(struct btree_trans *trans,
percpu_up_read(&c->mark_lock);
}
err:
+ printbuf_exit(&buf);
bch2_dev_put(ca);
return ret;
+invalid_bucket:
+ bch2_fs_inconsistent(c, "reference to invalid bucket\n %s",
+ (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
+ ret = -EIO;
+ goto err;
}
/*
@@ -1561,7 +1588,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (ret)
goto err;
- a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
+ a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
ret = bch2_trans_update(trans, alloc_iter,
&a_mut->k_i, BTREE_TRIGGER_norun);
if (ret)
@@ -1609,34 +1636,38 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
return ret;
}
-static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
+static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
{
int ret;
- mutex_lock(&c->discard_buckets_in_flight_lock);
- darray_for_each(c->discard_buckets_in_flight, i)
- if (bkey_eq(*i, bucket)) {
- ret = -EEXIST;
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i)
+ if (i->bucket == bucket) {
+ ret = -BCH_ERR_EEXIST_discard_in_flight_add;
goto out;
}
- ret = darray_push(&c->discard_buckets_in_flight, bucket);
+ ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
+ .in_progress = in_progress,
+ .bucket = bucket,
+ }));
out:
- mutex_unlock(&c->discard_buckets_in_flight_lock);
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
return ret;
}
-static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket)
+static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
{
- mutex_lock(&c->discard_buckets_in_flight_lock);
- darray_for_each(c->discard_buckets_in_flight, i)
- if (bkey_eq(*i, bucket)) {
- darray_remove_item(&c->discard_buckets_in_flight, i);
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i)
+ if (i->bucket == bucket) {
+ BUG_ON(!i->in_progress);
+ darray_remove_item(&ca->discard_buckets_in_flight, i);
goto found;
}
BUG();
found:
- mutex_unlock(&c->discard_buckets_in_flight_lock);
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
}
struct discard_buckets_state {
@@ -1644,26 +1675,11 @@ struct discard_buckets_state {
u64 open;
u64 need_journal_commit;
u64 discarded;
- struct bch_dev *ca;
u64 need_journal_commit_this_dev;
};
-static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
-{
- if (s->ca == ca)
- return;
-
- if (s->ca && s->need_journal_commit_this_dev >
- bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
- bch2_journal_flush_async(&c->journal, NULL);
-
- if (s->ca)
- percpu_ref_put(&s->ca->io_ref);
- s->ca = ca;
- s->need_journal_commit_this_dev = 0;
-}
-
static int bch2_discard_one_bucket(struct btree_trans *trans,
+ struct bch_dev *ca,
struct btree_iter *need_discard_iter,
struct bpos *discard_pos_done,
struct discard_buckets_state *s)
@@ -1677,16 +1693,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
bool discard_locked = false;
int ret = 0;
- struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode
- ? s->ca
- : bch2_dev_get_ioref(c, pos.inode, WRITE);
- if (!ca) {
- bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
- return 0;
- }
-
- discard_buckets_next_dev(c, s, ca);
-
if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
s->open++;
goto out;
@@ -1746,7 +1752,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
goto out;
}
- if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true)))
+ if (discard_in_flight_add(ca, iter.pos.offset, true))
goto out;
discard_locked = true;
@@ -1770,8 +1776,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
}
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
- alloc_data_type_set(&a->v, a->v.data_type);
write:
+ alloc_data_type_set(&a->v, a->v.data_type);
+
ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
@@ -1783,7 +1790,7 @@ write:
s->discarded++;
out:
if (discard_locked)
- discard_in_flight_remove(c, iter.pos);
+ discard_in_flight_remove(ca, iter.pos.offset);
s->seen++;
bch2_trans_iter_exit(trans, &iter);
printbuf_exit(&buf);
@@ -1792,7 +1799,8 @@ out:
static void bch2_do_discards_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
+ struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
+ struct bch_fs *c = ca->fs;
struct discard_buckets_state s = {};
struct bpos discard_pos_done = POS_MAX;
int ret;
@@ -1803,23 +1811,41 @@ static void bch2_do_discards_work(struct work_struct *work)
* successful commit:
*/
ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter,
- BTREE_ID_need_discard, POS_MIN, 0, k,
- bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));
-
- discard_buckets_next_dev(c, &s, NULL);
+ for_each_btree_key_upto(trans, iter,
+ BTREE_ID_need_discard,
+ POS(ca->dev_idx, 0),
+ POS(ca->dev_idx, U64_MAX), 0, k,
+ bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s)));
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ percpu_ref_put(&ca->io_ref);
+}
+
+void bch2_dev_do_discards(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ return;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
+ goto put_ioref;
+
+ if (queue_work(c->write_ref_wq, &ca->discard_work))
+ return;
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+put_ioref:
+ percpu_ref_put(&ca->io_ref);
}
void bch2_do_discards(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
- !queue_work(c->write_ref_wq, &c->discard_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ for_each_member_device(c, ca)
+ bch2_dev_do_discards(ca);
}
static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
@@ -1848,68 +1874,69 @@ err:
static void bch2_do_discards_fast_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work);
+ struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
+ struct bch_fs *c = ca->fs;
while (1) {
bool got_bucket = false;
- struct bpos bucket;
- struct bch_dev *ca;
+ u64 bucket;
- mutex_lock(&c->discard_buckets_in_flight_lock);
- darray_for_each(c->discard_buckets_in_flight, i) {
- if (i->snapshot)
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i) {
+ if (i->in_progress)
continue;
- ca = bch2_dev_get_ioref(c, i->inode, WRITE);
- if (!ca) {
- darray_remove_item(&c->discard_buckets_in_flight, i);
- continue;
- }
-
got_bucket = true;
- bucket = *i;
- i->snapshot = true;
+ bucket = i->bucket;
+ i->in_progress = true;
break;
}
- mutex_unlock(&c->discard_buckets_in_flight_lock);
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
if (!got_bucket)
break;
if (ca->mi.discard && !c->opts.nochanges)
blkdev_issue_discard(ca->disk_sb.bdev,
- bucket.offset * ca->mi.bucket_size,
+ bucket_to_sector(ca, bucket),
ca->mi.bucket_size,
GFP_KERNEL);
int ret = bch2_trans_do(c, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc,
- bch2_clear_bucket_needs_discard(trans, bucket));
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket)));
bch_err_fn(c, ret);
- percpu_ref_put(&ca->io_ref);
- discard_in_flight_remove(c, bucket);
+ discard_in_flight_remove(ca, bucket);
if (ret)
break;
}
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+ percpu_ref_put(&ca->io_ref);
}
-static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
+static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
{
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode);
- bool dead = !ca || percpu_ref_is_dying(&ca->io_ref);
- rcu_read_unlock();
+ struct bch_fs *c = ca->fs;
+
+ if (discard_in_flight_add(ca, bucket, false))
+ return;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ return;
- if (!dead &&
- !discard_in_flight_add(c, bucket) &&
- bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
- !queue_work(c->write_ref_wq, &c->discard_fast_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
+ goto put_ioref;
+
+ if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
+ return;
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+put_ioref:
+ percpu_ref_put(&ca->io_ref);
}
static int invalidate_one_bucket(struct btree_trans *trans,
@@ -1957,8 +1984,8 @@ static int invalidate_one_bucket(struct btree_trans *trans,
a->v.data_type = 0;
a->v.dirty_sectors = 0;
a->v.cached_sectors = 0;
- a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
- a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now);
+ a->v.io_time[READ] = bch2_current_io_time(c, READ);
+ a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE);
ret = bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
@@ -1993,9 +2020,25 @@ err:
goto out;
}
+static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
+ struct bch_dev *ca, bool *wrapped)
+{
+ struct bkey_s_c k;
+again:
+ k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
+ if (!k.k && !*wrapped) {
+ bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
+ *wrapped = true;
+ goto again;
+ }
+
+ return k;
+}
+
static void bch2_do_invalidates_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
+ struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
+ struct bch_fs *c = ca->fs;
struct btree_trans *trans = bch2_trans_get(c);
int ret = 0;
@@ -2003,31 +2046,63 @@ static void bch2_do_invalidates_work(struct work_struct *work)
if (ret)
goto err;
- for_each_member_device(c, ca) {
- s64 nr_to_invalidate =
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
+ s64 nr_to_invalidate =
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
+ struct btree_iter iter;
+ bool wrapped = false;
- ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
- lru_pos(ca->dev_idx, 0, 0),
- lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
- BTREE_ITER_intent, k,
- invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
+ lru_pos(ca->dev_idx, 0,
+ ((bch2_current_io_time(c, READ) + U32_MAX) &
+ LRU_TIME_MAX)), 0);
- if (ret < 0) {
- bch2_dev_put(ca);
+ while (true) {
+ bch2_trans_begin(trans);
+
+ struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
break;
- }
+ if (!k.k)
+ break;
+
+ ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
+ if (ret)
+ break;
+
+ bch2_btree_iter_advance(&iter);
}
+ bch2_trans_iter_exit(trans, &iter);
err:
bch2_trans_put(trans);
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+ percpu_ref_put(&ca->io_ref);
+}
+
+void bch2_dev_do_invalidates(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ return;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
+ goto put_ioref;
+
+ if (queue_work(c->write_ref_wq, &ca->invalidate_work))
+ return;
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+put_ioref:
+ percpu_ref_put(&ca->io_ref);
}
void bch2_do_invalidates(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
- !queue_work(c->write_ref_wq, &c->invalidate_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+ for_each_member_device(c, ca)
+ bch2_dev_do_invalidates(ca);
}
int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
@@ -2186,7 +2261,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
if (ret)
return ret;
- now = atomic64_read(&c->io_clock[rw].now);
+ now = bch2_current_io_time(c, rw);
if (a->v.io_time[rw] == now)
goto out;
@@ -2343,16 +2418,20 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
-void bch2_fs_allocator_background_exit(struct bch_fs *c)
+void bch2_dev_allocator_background_exit(struct bch_dev *ca)
+{
+ darray_exit(&ca->discard_buckets_in_flight);
+}
+
+void bch2_dev_allocator_background_init(struct bch_dev *ca)
{
- darray_exit(&c->discard_buckets_in_flight);
+ mutex_init(&ca->discard_buckets_in_flight_lock);
+ INIT_WORK(&ca->discard_work, bch2_do_discards_work);
+ INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
+ INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
}
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
- mutex_init(&c->discard_buckets_in_flight_lock);
- INIT_WORK(&c->discard_work, bch2_do_discards_work);
- INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work);
- INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index ae31a94be6f9..ba2c5557a3f0 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -141,7 +141,13 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
!bch2_bucket_sectors_fragmented(ca, a))
return 0;
- u64 d = bch2_bucket_sectors_dirty(a);
+ /*
+ * avoid overflowing LRU_TIME_BITS on a corrupted fs, when
+ * bucket_sectors_dirty is (much) bigger than bucket_size
+ */
+ u64 d = min(bch2_bucket_sectors_dirty(a),
+ ca->mi.bucket_size);
+
return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
}
@@ -269,6 +275,7 @@ int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
enum btree_iter_update_trigger_flags);
int bch2_check_alloc_info(struct bch_fs *);
int bch2_check_alloc_to_lru_refs(struct bch_fs *);
+void bch2_dev_do_discards(struct bch_dev *);
void bch2_do_discards(struct bch_fs *);
static inline u64 should_invalidate_buckets(struct bch_dev *ca,
@@ -283,6 +290,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
}
+void bch2_dev_do_invalidates(struct bch_dev *);
void bch2_do_invalidates(struct bch_fs *);
static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
@@ -306,7 +314,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-void bch2_fs_allocator_background_exit(struct bch_fs *);
+void bch2_dev_allocator_background_exit(struct bch_dev *);
+void bch2_dev_allocator_background_init(struct bch_dev *);
+
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 927a5f300b30..9d3d64746a5b 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -621,13 +621,13 @@ again:
avail = dev_buckets_free(ca, *usage, watermark);
if (usage->d[BCH_DATA_need_discard].buckets > avail)
- bch2_do_discards(c);
+ bch2_dev_do_discards(ca);
if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
bch2_gc_gens_async(c);
if (should_invalidate_buckets(ca, *usage))
- bch2_do_invalidates(c);
+ bch2_dev_do_invalidates(ca);
if (!avail) {
if (cl && !waiting) {
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 2a538eb2af11..1106fec6e155 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -493,6 +493,11 @@ struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
+struct discard_in_flight {
+ bool in_progress:1;
+ u64 bucket:63;
+};
+
struct bch_dev {
struct kobject kobj;
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -554,6 +559,12 @@ struct bch_dev {
size_t inc_gen_really_needs_gc;
size_t buckets_waiting_on_journal;
+ struct work_struct invalidate_work;
+ struct work_struct discard_work;
+ struct mutex discard_buckets_in_flight_lock;
+ DARRAY(struct discard_in_flight) discard_buckets_in_flight;
+ struct work_struct discard_fast_work;
+
atomic64_t rebalance_work;
struct journal_device journal;
@@ -790,7 +801,8 @@ struct bch_fs {
/* BTREE CACHE */
struct bio_set btree_bio;
- struct workqueue_struct *io_complete_wq;
+ struct workqueue_struct *btree_read_complete_wq;
+ struct workqueue_struct *btree_write_submit_wq;
struct btree_root btree_roots_known[BTREE_ID_NR];
DARRAY(struct btree_root) btree_roots_extra;
@@ -914,11 +926,6 @@ struct bch_fs {
unsigned write_points_nr;
struct buckets_waiting_for_journal buckets_waiting_for_journal;
- struct work_struct invalidate_work;
- struct work_struct discard_work;
- struct mutex discard_buckets_in_flight_lock;
- DARRAY(struct bpos) discard_buckets_in_flight;
- struct work_struct discard_fast_work;
/* GARBAGE COLLECTION */
struct work_struct gc_gens_work;
@@ -1213,6 +1220,11 @@ static inline s64 bch2_current_time(const struct bch_fs *c)
return timespec_to_bch2_time(c, now);
}
+static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw)
+{
+ return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX);
+}
+
static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
{
struct stdio_redirect *stdio = c->stdio;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 90c12fe2a2cd..e3b1bde489c3 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -476,6 +476,9 @@ struct bch_lru {
#define LRU_ID_STRIPES (1U << 16)
+#define LRU_TIME_BITS 48
+#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
+
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -987,8 +990,9 @@ enum bch_version_upgrade_opts {
#define BCH_ERROR_ACTIONS() \
x(continue, 0) \
- x(ro, 1) \
- x(panic, 2)
+ x(fix_safe, 1) \
+ x(panic, 2) \
+ x(ro, 3)
enum bch_error_actions {
#define x(t, n) BCH_ON_ERROR_##t = n,
@@ -1382,9 +1386,10 @@ enum btree_id {
/*
* Maximum number of btrees that we will _ever_ have under the current scheme,
- * where we refer to them with bitfields
+ * where we refer to them with 64 bit bitfields - and we also need a bit for
+ * the interior btree node type:
*/
-#define BTREE_ID_NR_MAX 64
+#define BTREE_ID_NR_MAX 63
static inline bool btree_id_is_alloc(enum btree_id id)
{
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index f46978e5cb7c..94a1d1982fa8 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -1064,7 +1064,7 @@ void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
{
const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
u8 *l = k->key_start;
- u8 *h = (u8 *) (k->_data + f->key_u64s) - 1;
+ u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1;
while (l < h) {
swap(*l, *h);
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index c2c3dae52186..bd32aac05192 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -398,8 +398,12 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
for (i = 0; i < nr_compat; i++)
switch (!write ? i : nr_compat - 1 - i) {
case 0:
- if (big_endian != CPU_BIG_ENDIAN)
+ if (big_endian != CPU_BIG_ENDIAN) {
+ bch2_bkey_swab_key(f, k);
+ } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
bch2_bkey_swab_key(f, k);
+ bch2_bkey_swab_key(f, k);
+ }
break;
case 1:
if (version < bcachefs_metadata_version_bkey_renumber)
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 726ef7483763..baef0722f5fb 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -129,7 +129,8 @@ static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
struct bkey_packed *k)
{
if (version < bcachefs_metadata_version_current ||
- big_endian != CPU_BIG_ENDIAN)
+ big_endian != CPU_BIG_ENDIAN ||
+ IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
__bch2_bkey_compat(level, btree_id, version,
big_endian, write, f, k);
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 9e4ed75d3675..4f5e411771ba 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -91,10 +91,11 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
}
static const struct rhashtable_params bch_btree_cache_params = {
- .head_offset = offsetof(struct btree, hash),
- .key_offset = offsetof(struct btree, hash_val),
- .key_len = sizeof(u64),
- .obj_cmpfn = bch2_btree_cache_cmp_fn,
+ .head_offset = offsetof(struct btree, hash),
+ .key_offset = offsetof(struct btree, hash_val),
+ .key_len = sizeof(u64),
+ .obj_cmpfn = bch2_btree_cache_cmp_fn,
+ .automatic_shrinking = true,
};
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index dc97991bcd6a..0e477a926579 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -874,6 +874,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
const struct bch_alloc_v4 *old;
int ret;
+ if (!bucket_valid(ca, k.k->p.offset))
+ return 0;
+
old = bch2_alloc_to_v4(k, &old_convert);
gc = new = *old;
@@ -990,6 +993,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
buckets->first_bucket = ca->mi.first_bucket;
buckets->nbuckets = ca->mi.nbuckets;
+ buckets->nbuckets_minus_first =
+ buckets->nbuckets - buckets->first_bucket;
rcu_assign_pointer(ca->buckets_gc, buckets);
}
@@ -1003,12 +1008,14 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
continue;
}
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+ if (bucket_valid(ca, k.k->p.offset)) {
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
- struct bucket *g = gc_bucket(ca, k.k->p.offset);
- g->gen_valid = 1;
- g->gen = a->gen;
+ struct bucket *g = gc_bucket(ca, k.k->p.offset);
+ g->gen_valid = 1;
+ g->gen = a->gen;
+ }
0;
})));
bch2_dev_put(ca);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 829c1b91477d..7bca15c604f5 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1389,7 +1389,7 @@ static void btree_node_read_endio(struct bio *bio)
bch2_latency_acct(ca, rb->start_time, READ);
}
- queue_work(c->io_complete_wq, &rb->work);
+ queue_work(c->btree_read_complete_wq, &rb->work);
}
struct btree_node_read_all {
@@ -1656,7 +1656,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
btree_node_read_all_replicas_done(&ra->cl.work);
} else {
continue_at(&ra->cl, btree_node_read_all_replicas_done,
- c->io_complete_wq);
+ c->btree_read_complete_wq);
}
return 0;
@@ -1737,7 +1737,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
if (sync)
btree_node_read_work(&rb->work);
else
- queue_work(c->io_complete_wq, &rb->work);
+ queue_work(c->btree_read_complete_wq, &rb->work);
}
}
@@ -2229,7 +2229,7 @@ do_write:
atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
INIT_WORK(&wbio->work, btree_write_submit);
- queue_work(c->io_complete_wq, &wbio->work);
+ queue_work(c->btree_write_submit_wq, &wbio->work);
return;
err:
set_btree_node_noevict(b);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index d3bcb4e4e230..0ed9e6574fcd 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -221,11 +221,8 @@ static void bch2_btree_path_verify(struct btree_trans *trans,
struct btree_path *path)
{
struct bch_fs *c = trans->c;
- unsigned i;
-
- EBUG_ON(path->btree_id >= BTREE_ID_NR);
- for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
+ for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
if (!path->l[i].b) {
BUG_ON(!path->cached &&
bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
@@ -251,8 +248,6 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
- BUG_ON(iter->btree_id >= BTREE_ID_NR);
-
BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
@@ -3135,7 +3130,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
memset(trans, 0, sizeof(*trans));
- closure_init_stack(&trans->ref);
seqmutex_lock(&c->btree_trans_lock);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
@@ -3155,15 +3149,10 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
BUG_ON(pos_task &&
pid == pos_task->pid &&
pos->locked);
-
- if (pos_task && pid < pos_task->pid) {
- list_add_tail(&trans->list, &pos->list);
- goto list_add_done;
- }
}
}
- list_add_tail(&trans->list, &c->btree_trans_list);
-list_add_done:
+
+ list_add(&trans->list, &c->btree_trans_list);
seqmutex_unlock(&c->btree_trans_lock);
got_trans:
trans->c = c;
@@ -3204,6 +3193,8 @@ got_trans:
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
trans->srcu_lock_time = jiffies;
trans->srcu_held = true;
+
+ closure_init_stack_release(&trans->ref);
return trans;
}
@@ -3240,7 +3231,6 @@ void bch2_trans_put(struct btree_trans *trans)
trans_for_each_update(trans, i)
__btree_path_put(trans->paths + i->path, true);
trans->nr_updates = 0;
- trans->locking_wait.task = NULL;
check_btree_paths_leaked(trans);
@@ -3261,6 +3251,13 @@ void bch2_trans_put(struct btree_trans *trans)
if (unlikely(trans->journal_replay_not_finished))
bch2_journal_keys_put(c);
+ /*
+ * trans->ref protects trans->locking_wait.task, btree_paths array; used
+ * by cycle detector
+ */
+ closure_return_sync(&trans->ref);
+ trans->locking_wait.task = NULL;
+
unsigned long *paths_allocated = trans->paths_allocated;
trans->paths_allocated = NULL;
trans->paths = NULL;
@@ -3278,8 +3275,6 @@ void bch2_trans_put(struct btree_trans *trans)
trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
if (trans) {
- closure_sync(&trans->ref);
-
seqmutex_lock(&c->btree_trans_lock);
list_del(&trans->list);
seqmutex_unlock(&c->btree_trans_lock);
@@ -3385,8 +3380,6 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
if (trans) {
- closure_sync(&trans->ref);
-
seqmutex_lock(&c->btree_trans_lock);
list_del(&trans->list);
seqmutex_unlock(&c->btree_trans_lock);
@@ -3406,8 +3399,10 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
bch2_time_stats_exit(&s->lock_hold_times);
}
- if (c->btree_trans_barrier_initialized)
+ if (c->btree_trans_barrier_initialized) {
+ synchronize_srcu_expedited(&c->btree_trans_barrier);
cleanup_srcu_struct(&c->btree_trans_barrier);
+ }
mempool_exit(&c->btree_trans_mem_pool);
mempool_exit(&c->btree_trans_pool);
}
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 34056aaece00..2d3c0d45c37f 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -32,10 +32,11 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
}
static const struct rhashtable_params bch2_btree_key_cache_params = {
- .head_offset = offsetof(struct bkey_cached, hash),
- .key_offset = offsetof(struct bkey_cached, key),
- .key_len = sizeof(struct bkey_cached_key),
- .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
+ .head_offset = offsetof(struct bkey_cached, hash),
+ .key_offset = offsetof(struct bkey_cached, key),
+ .key_len = sizeof(struct bkey_cached_key),
+ .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
+ .automatic_shrinking = true,
};
__flatten
@@ -840,7 +841,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
- freed++;
bc->nr_freed_nonpcpu--;
bc->freed++;
}
@@ -854,7 +854,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
- freed++;
bc->nr_freed_pcpu--;
bc->freed++;
}
@@ -876,23 +875,22 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
bc->skipped_dirty++;
- goto next;
} else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
bc->skipped_accessed++;
- goto next;
- } else if (bkey_cached_lock_for_evict(ck)) {
+ } else if (!bkey_cached_lock_for_evict(ck)) {
+ bc->skipped_lock_fail++;
+ } else {
bkey_cached_evict(bc, ck);
bkey_cached_free(bc, ck);
bc->moved_to_freelist++;
- } else {
- bc->skipped_lock_fail++;
+ freed++;
}
scanned++;
if (scanned >= nr)
break;
-next:
+
pos = next;
}
@@ -917,6 +915,14 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
long nr = atomic_long_read(&bc->nr_keys) -
atomic_long_read(&bc->nr_dirty);
+ /*
+ * Avoid hammering our shrinker too much if it's nearly empty - the
+ * shrinker code doesn't take into account how big our cache is, if it's
+ * mostly empty but the system is under memory pressure it causes nasty
+ * lock contention:
+ */
+ nr -= 128;
+
return max(0L, nr);
}
@@ -1025,9 +1031,10 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
if (!shrink)
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
bc->shrink = shrink;
- shrink->seeks = 0;
shrink->count_objects = bch2_btree_key_cache_count;
shrink->scan_objects = bch2_btree_key_cache_scan;
+ shrink->batch = 1 << 14;
+ shrink->seeks = 0;
shrink->private_data = c;
shrinker_register(shrink);
return 0;
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index c3e9b0cc7bbd..d66fff22109a 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -215,6 +215,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
if (unlikely(!best)) {
struct printbuf buf = PRINTBUF;
+ buf.atomic++;
prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 45cb8149d374..2cb0442f6cc9 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -72,10 +72,11 @@ static bool found_btree_node_is_readable(struct btree_trans *trans,
struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
bool ret = !IS_ERR_OR_NULL(b);
- if (ret) {
- f->sectors_written = b->written;
- six_unlock_read(&b->c.lock);
- }
+ if (!ret)
+ return ret;
+
+ f->sectors_written = b->written;
+ six_unlock_read(&b->c.lock);
/*
* We might update this node's range; if that happens, we need the node
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index d63db4fefe73..87f485e9c552 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -761,13 +761,13 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
- const unsigned mask = 0
+ const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
BCH_BTREE_IDS()
#undef x
;
- return (1U << type) & mask;
+ return BIT_ULL(type) & mask;
}
static inline bool btree_id_is_extents(enum btree_id btree)
@@ -777,35 +777,35 @@ static inline bool btree_id_is_extents(enum btree_id btree)
static inline bool btree_type_has_snapshots(enum btree_id id)
{
- const unsigned mask = 0
+ const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
BCH_BTREE_IDS()
#undef x
;
- return (1U << id) & mask;
+ return BIT_ULL(id) & mask;
}
static inline bool btree_type_has_snapshot_field(enum btree_id id)
{
- const unsigned mask = 0
+ const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
BCH_BTREE_IDS()
#undef x
;
- return (1U << id) & mask;
+ return BIT_ULL(id) & mask;
}
static inline bool btree_type_has_ptrs(enum btree_id id)
{
- const unsigned mask = 0
+ const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr)
BCH_BTREE_IDS()
#undef x
;
- return (1U << id) & mask;
+ return BIT_ULL(id) & mask;
}
struct btree_root {
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index ed97712d0db1..743d57eba760 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -465,143 +465,172 @@ int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64
return bch2_update_replicas_list(trans, &r.e, sectors);
}
-int bch2_check_fix_ptrs(struct btree_trans *trans,
- enum btree_id btree, unsigned level, struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
+static int bch2_check_fix_ptr(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
+ bool *do_update)
{
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry_c;
- struct extent_ptr_decoded p = { 0 };
- bool do_update = false;
struct printbuf buf = PRINTBUF;
int ret = 0;
- percpu_down_read(&c->mark_lock);
+ struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
+ if (!ca) {
+ if (fsck_err(c, ptr_to_invalid_device,
+ "pointer to missing device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+ return 0;
+ }
- bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
- struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
- if (!ca) {
- if (fsck_err(c, ptr_to_invalid_device,
- "pointer to missing device %u\n"
- "while marking %s",
- p.ptr.dev,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
- continue;
- }
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ if (!g) {
+ if (fsck_err(c, ptr_to_invalid_device,
+ "pointer to invalid bucket on device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+ goto out;
+ }
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry_c);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
- if (fsck_err_on(!g->gen_valid,
- c, ptr_to_missing_alloc_key,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (!p.ptr.cached) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- } else {
- do_update = true;
- }
+ if (fsck_err_on(!g->gen_valid,
+ c, ptr_to_missing_alloc_key,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (!p.ptr.cached) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ } else {
+ *do_update = true;
}
+ }
- if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
- c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (!p.ptr.cached &&
- (g->data_type != BCH_DATA_btree ||
- data_type == BCH_DATA_btree)) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- g->data_type = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- } else {
- do_update = true;
- }
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (!p.ptr.cached &&
+ (g->data_type != BCH_DATA_btree ||
+ data_type == BCH_DATA_btree)) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ } else {
+ *do_update = true;
+ }
+ }
+
+ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+
+ if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
+ c, stale_dirty_ptr,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+
+ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
+ goto out;
+
+ if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
+ c, ptr_bucket_data_type_mismatch,
+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (data_type == BCH_DATA_btree) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = data_type;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ } else {
+ *do_update = true;
}
+ }
+
+ if (p.has_ec) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
- if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
- c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ if (fsck_err_on(!m || !m->alive,
+ c, ptr_to_missing_stripe,
+ "pointer to nonexistent stripe %llu\n"
"while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen,
+ (u64) p.ec.idx,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
+ *do_update = true;
- if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
- c, stale_dirty_ptr,
- "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p),
+ c, ptr_to_incorrect_stripe,
+ "pointer does not match stripe %llu\n"
"while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen, g->gen,
+ (u64) p.ec.idx,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
+ *do_update = true;
+ }
+out:
+fsck_err:
+ bch2_dev_put(ca);
+ printbuf_exit(&buf);
+ return ret;
+}
- if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
- goto next;
+int bch2_check_fix_ptrs(struct btree_trans *trans,
+ enum btree_id btree, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry_c;
+ struct extent_ptr_decoded p = { 0 };
+ bool do_update = false;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
- if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
- c, ptr_bucket_data_type_mismatch,
- "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (data_type == BCH_DATA_btree) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- g->data_type = data_type;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- } else {
- do_update = true;
- }
- }
+ percpu_down_read(&c->mark_lock);
- if (p.has_ec) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
-
- if (fsck_err_on(!m || !m->alive, c,
- ptr_to_missing_stripe,
- "pointer to nonexistent stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
-
- if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
- ptr_to_incorrect_stripe,
- "pointer does not match stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
- }
-next:
- bch2_dev_put(ca);
+ bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
+ ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
+ if (ret)
+ goto err;
}
if (do_update) {
@@ -716,7 +745,6 @@ found:
bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
}
err:
-fsck_err:
percpu_up_read(&c->mark_lock);
printbuf_exit(&buf);
return ret;
@@ -987,6 +1015,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
enum btree_iter_update_trigger_flags flags)
{
bool insert = !(flags & BTREE_TRIGGER_overwrite);
+ struct printbuf buf = PRINTBUF;
int ret = 0;
struct bch_fs *c = trans->c;
@@ -1019,6 +1048,13 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_gc) {
percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, bucket.offset);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
+ p.ptr.dev,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = -EIO;
+ goto err_unlock;
+ }
+
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &new);
@@ -1027,10 +1063,12 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
bch2_dev_usage_update(c, ca, &old, &new, 0, true);
}
bucket_unlock(g);
+err_unlock:
percpu_up_read(&c->mark_lock);
}
err:
bch2_dev_put(ca);
+ printbuf_exit(&buf);
return ret;
}
@@ -1318,10 +1356,11 @@ static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
u64 b, enum bch_data_type data_type, unsigned sectors,
enum btree_iter_update_trigger_flags flags)
{
- int ret = 0;
-
percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, b);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
+ ca->dev_idx, bch2_data_type_str(data_type)))
+ goto err_unlock;
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
@@ -1330,29 +1369,27 @@ static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
g->data_type != data_type, c,
"different types of data in same bucket: %s, %s",
bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type))) {
- ret = -EIO;
+ bch2_data_type_str(data_type)))
goto err;
- }
if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
"bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
ca->dev_idx, b, g->gen,
bch2_data_type_str(g->data_type ?: data_type),
- g->dirty_sectors, sectors)) {
- ret = -EIO;
+ g->dirty_sectors, sectors))
goto err;
- }
g->data_type = data_type;
g->dirty_sectors += sectors;
struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
+ bch2_dev_usage_update(c, ca, &old, &new, 0, true);
+ percpu_up_read(&c->mark_lock);
+ return 0;
err:
bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update(c, ca, &old, &new, 0, true);
+err_unlock:
percpu_up_read(&c->mark_lock);
- return ret;
+ return -EIO;
}
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
@@ -1595,6 +1632,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bucket_gens->first_bucket = ca->mi.first_bucket;
bucket_gens->nbuckets = nbuckets;
+ bucket_gens->nbuckets_minus_first =
+ bucket_gens->nbuckets - bucket_gens->first_bucket;
if (resize) {
down_write(&c->gc_lock);
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 617ffde2fb7a..80ee0be9793e 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -93,7 +93,8 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
struct bucket_array *buckets = gc_bucket_array(ca);
- BUG_ON(!bucket_valid(ca, b));
+ if (b - buckets->first_bucket >= buckets->nbuckets_minus_first)
+ return NULL;
return buckets->b + b;
}
@@ -110,7 +111,8 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
{
struct bucket_gens *gens = bucket_gens(ca);
- BUG_ON(!bucket_valid(ca, b));
+ if (b - gens->first_bucket >= gens->nbuckets_minus_first)
+ return NULL;
return gens->b + b;
}
@@ -170,19 +172,22 @@ static inline int gen_after(u8 a, u8 b)
return r > 0 ? r : 0;
}
-static inline u8 dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
+static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
{
- return gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
+ u8 *gen = bucket_gen(ca, PTR_BUCKET_NR(ca, ptr));
+ if (!gen)
+ return -1;
+ return gen_after(*gen, ptr->gen);
}
/**
* dev_ptr_stale() - check if a pointer points into a bucket that has been
* invalidated.
*/
-static inline u8 dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
+static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
{
rcu_read_lock();
- u8 ret = dev_ptr_stale_rcu(ca, ptr);
+ int ret = dev_ptr_stale_rcu(ca, ptr);
rcu_read_unlock();
return ret;
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 6a31740222a7..f636e17c4caf 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -22,6 +22,7 @@ struct bucket_array {
struct rcu_head rcu;
u16 first_bucket;
size_t nbuckets;
+ size_t nbuckets_minus_first;
struct bucket b[];
};
@@ -29,6 +30,7 @@ struct bucket_gens {
struct rcu_head rcu;
u16 first_bucket;
size_t nbuckets;
+ size_t nbuckets_minus_first;
u8 b[];
};
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 9e54323f0f5f..6d82e1165adc 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -216,7 +216,8 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
ret = PTR_ERR_OR_ZERO(optstr) ?:
bch2_parse_mount_opts(NULL, &thr->opts, optstr);
- kfree(optstr);
+ if (!IS_ERR(optstr))
+ kfree(optstr);
if (ret)
goto err;
@@ -319,7 +320,8 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
return ret;
ret = bch2_dev_add(c, path);
- kfree(path);
+ if (!IS_ERR(path))
+ kfree(path);
return ret;
}
@@ -850,7 +852,8 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
ret = PTR_ERR_OR_ZERO(optstr) ?:
bch2_parse_mount_opts(c, &thr->opts, optstr);
- kfree(optstr);
+ if (!IS_ERR(optstr))
+ kfree(optstr);
if (ret)
goto err;
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 0d807c2ce9c6..1a0072eef109 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -202,9 +202,8 @@ restart_drop_conflicting_replicas:
bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
/* Now, drop excess replicas: */
-restart_drop_extra_replicas:
-
rcu_read_lock();
+restart_drop_extra_replicas:
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 51cbf3928361..f0d4727c4dc2 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -568,6 +568,32 @@ static const struct file_operations cached_btree_nodes_ops = {
.read = bch2_cached_btree_nodes_read,
};
+typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r);
+
+static void list_sort(struct list_head *head, list_cmp_fn cmp)
+{
+ struct list_head *pos;
+
+ list_for_each(pos, head)
+ while (!list_is_last(pos, head) &&
+ cmp(pos, pos->next) > 0) {
+ struct list_head *pos2, *next = pos->next;
+
+ list_del(next);
+ list_for_each(pos2, head)
+ if (cmp(next, pos2) < 0)
+ goto pos_found;
+ BUG();
+pos_found:
+ list_add_tail(next, pos2);
+ }
+}
+
+static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r)
+{
+ return cmp_int(l, r);
+}
+
static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
@@ -575,41 +601,39 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
struct bch_fs *c = i->c;
struct btree_trans *trans;
ssize_t ret = 0;
- u32 seq;
i->ubuf = buf;
i->size = size;
i->ret = 0;
restart:
seqmutex_lock(&c->btree_trans_lock);
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+ list_sort(&c->btree_trans_list, list_ptr_order_cmp);
- if (!task || task->pid <= i->iter)
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if ((ulong) trans < i->iter)
continue;
- closure_get(&trans->ref);
- seq = seqmutex_seq(&c->btree_trans_lock);
- seqmutex_unlock(&c->btree_trans_lock);
+ i->iter = (ulong) trans;
- ret = flush_buf(i);
- if (ret) {
- closure_put(&trans->ref);
- goto unlocked;
- }
+ if (!closure_get_not_zero(&trans->ref))
+ continue;
+
+ u32 seq = seqmutex_unlock(&c->btree_trans_lock);
bch2_btree_trans_to_text(&i->buf, trans);
prt_printf(&i->buf, "backtrace:\n");
printbuf_indent_add(&i->buf, 2);
- bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
+ bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL);
printbuf_indent_sub(&i->buf, 2);
prt_newline(&i->buf);
- i->iter = task->pid;
-
closure_put(&trans->ref);
+ ret = flush_buf(i);
+ if (ret)
+ goto unlocked;
+
if (!seqmutex_relock(&c->btree_trans_lock, seq))
goto restart;
}
@@ -804,50 +828,55 @@ static const struct file_operations btree_transaction_stats_op = {
.read = btree_transaction_stats_read,
};
-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
+/* walk btree transactions until we find a deadlock and print it */
+static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
struct btree_trans *trans;
- ssize_t ret = 0;
- u32 seq;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- if (i->iter)
- goto out;
+ pid_t iter = 0;
restart:
seqmutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
struct task_struct *task = READ_ONCE(trans->locking_wait.task);
- if (!task || task->pid <= i->iter)
+ if (!task || task->pid <= iter)
continue;
- closure_get(&trans->ref);
- seq = seqmutex_seq(&c->btree_trans_lock);
- seqmutex_unlock(&c->btree_trans_lock);
+ iter = task->pid;
- ret = flush_buf(i);
- if (ret) {
- closure_put(&trans->ref);
- goto out;
- }
+ if (!closure_get_not_zero(&trans->ref))
+ continue;
- bch2_check_for_deadlock(trans, &i->buf);
+ u32 seq = seqmutex_unlock(&c->btree_trans_lock);
- i->iter = task->pid;
+ bool found = bch2_check_for_deadlock(trans, out) != 0;
closure_put(&trans->ref);
+ if (found)
+ return;
+
if (!seqmutex_relock(&c->btree_trans_lock, seq))
goto restart;
}
seqmutex_unlock(&c->btree_trans_lock);
-out:
+}
+
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (!i->iter) {
+ btree_deadlock_to_text(&i->buf, c);
+ i->iter++;
+ }
+
if (i->buf.allocation_failure)
ret = -ENOMEM;
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index d8b9beca3776..83e279d41829 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -268,6 +268,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
@@ -289,6 +290,13 @@ static int mark_stripe_bucket(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_gc) {
percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, bucket.offset);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
+ ptr->dev,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -EIO;
+ goto err_unlock;
+ }
+
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
@@ -297,10 +305,12 @@ static int mark_stripe_bucket(struct btree_trans *trans,
bch2_dev_usage_update(c, ca, &old, &new, 0, true);
}
bucket_unlock(g);
+err_unlock:
percpu_up_read(&c->mark_lock);
}
err:
bch2_dev_put(ca);
+ printbuf_exit(&buf);
return ret;
}
@@ -714,10 +724,12 @@ static void ec_block_endio(struct bio *bio)
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
- if (dev_ptr_stale(ca, ptr)) {
+ int stale = dev_ptr_stale(ca, ptr);
+ if (stale) {
bch_err_ratelimited(ca->fs,
- "error %s stripe: stale pointer after io",
- bio_data_dir(bio) == READ ? "reading from" : "writing to");
+ "error %s stripe: stale/invalid pointer (%i) after io",
+ bio_data_dir(bio) == READ ? "reading from" : "writing to",
+ stale);
clear_bit(ec_bio->idx, ec_bio->buf->valid);
}
@@ -743,10 +755,12 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
return;
}
- if (dev_ptr_stale(ca, ptr)) {
+ int stale = dev_ptr_stale(ca, ptr);
+ if (stale) {
bch_err_ratelimited(c,
- "error %s stripe: stale pointer",
- rw == READ ? "reading from" : "writing to");
+ "error %s stripe: stale pointer (%i)",
+ rw == READ ? "reading from" : "writing to",
+ stale);
clear_bit(idx, buf->valid);
return;
}
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index dbe35b80bc0b..58612abf7927 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -116,6 +116,9 @@
x(ENOENT, ENOENT_dev_idx_not_found) \
x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
+ x(EEXIST, EEXIST_str_hash_set) \
+ x(EEXIST, EEXIST_discard_in_flight_add) \
+ x(EEXIST, EEXIST_subvolume_create) \
x(0, open_buckets_empty) \
x(0, freelist_empty) \
x(BCH_ERR_freelist_empty, no_buckets_found) \
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index c66eeffcd7f2..d95c40f1b6af 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -15,6 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c)
switch (c->opts.errors) {
case BCH_ON_ERROR_continue:
return false;
+ case BCH_ON_ERROR_fix_safe:
case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
@@ -191,6 +192,12 @@ static void prt_actioning(struct printbuf *out, const char *action)
prt_str(out, "ing");
}
+static const u8 fsck_flags_extra[] = {
+#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags,
+ BCH_SB_ERRS()
+#undef x
+};
+
int bch2_fsck_err(struct bch_fs *c,
enum bch_fsck_flags flags,
enum bch_sb_error_id err,
@@ -203,6 +210,9 @@ int bch2_fsck_err(struct bch_fs *c,
int ret = -BCH_ERR_fsck_ignore;
const char *action_orig = "fix?", *action = action_orig;
+ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
+ flags |= fsck_flags_extra[err];
+
if ((flags & FSCK_CAN_FIX) &&
test_bit(err, c->sb.errors_silent))
return -BCH_ERR_fsck_fix;
@@ -265,7 +275,14 @@ int bch2_fsck_err(struct bch_fs *c,
prt_printf(out, bch2_log_msg(c, ""));
#endif
- if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
+ if ((flags & FSCK_CAN_FIX) &&
+ (flags & FSCK_AUTOFIX) &&
+ (c->opts.errors == BCH_ON_ERROR_continue ||
+ c->opts.errors == BCH_ON_ERROR_fix_safe)) {
+ prt_str(out, ", ");
+ prt_actioning(out, action);
+ ret = -BCH_ERR_fsck_fix;
+ } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
if (c->opts.errors != BCH_ON_ERROR_continue ||
!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
prt_str(out, ", shutting down");
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 36caedf72d89..777711504c35 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -108,13 +108,6 @@ struct fsck_err_state {
char *last_msg;
};
-enum bch_fsck_flags {
- FSCK_CAN_FIX = 1 << 0,
- FSCK_CAN_IGNORE = 1 << 1,
- FSCK_NEED_FSCK = 1 << 2,
- FSCK_NO_RATELIMIT = 1 << 3,
-};
-
#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
__printf(4, 5) __cold
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 469037929685..410b8bd81b5a 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -137,7 +137,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
- if (p.ptr.cached && (!ca || dev_ptr_stale(ca, &p.ptr)))
+ if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
@@ -999,7 +999,7 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
bch2_bkey_drop_ptrs(k, ptr,
ptr->cached &&
(ca = bch2_dev_rcu(c, ptr->dev)) &&
- dev_ptr_stale_rcu(ca, ptr));
+ dev_ptr_stale_rcu(ca, ptr) > 0);
rcu_read_unlock();
return bkey_deleted(k.k);
@@ -1024,8 +1024,11 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
prt_str(out, " cached");
if (ptr->unwritten)
prt_str(out, " unwritten");
- if (bucket_valid(ca, b) && dev_ptr_stale_rcu(ca, ptr))
+ int stale = dev_ptr_stale_rcu(ca, ptr);
+ if (stale > 0)
prt_printf(out, " stale");
+ else if (stale)
+ prt_printf(out, " invalid");
}
rcu_read_unlock();
--out->atomic;
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 205a323ffc6d..79a0c8732bce 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -308,8 +308,8 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
return ret;
}
-static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
{
struct inode *dir;
struct bch_inode_info *inode;
@@ -373,7 +373,7 @@ retry:
}
if (dst_dentry->d_inode) {
- error = -EEXIST;
+ error = -BCH_ERR_EEXIST_subvolume_create;
goto err3;
}
@@ -406,9 +406,12 @@ retry:
!arg.src_ptr)
snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol;
+ down_write(&c->snapshot_create_lock);
inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
dst_dentry, arg.mode|S_IFDIR,
0, snapshot_src, create_flags);
+ up_write(&c->snapshot_create_lock);
+
error = PTR_ERR_OR_ZERO(inode);
if (error)
goto err3;
@@ -429,16 +432,6 @@ err1:
return error;
}
-static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
-{
- down_write(&c->snapshot_create_lock);
- long ret = __bch2_ioctl_subvolume_create(c, filp, arg);
- up_write(&c->snapshot_create_lock);
-
- return ret;
-}
-
static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
struct bch_ioctl_subvolume arg)
{
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index cd388f1702dc..f9c9a95d7d4c 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -188,6 +188,12 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
BUG_ON(!old);
if (unlikely(old != inode)) {
+ /*
+ * bcachefs doesn't use I_NEW; we have no use for it since we
+ * only insert fully created inodes in the inode hash table. But
+ * discard_new_inode() expects it to be set...
+ */
+ inode->v.i_flags |= I_NEW;
discard_new_inode(&inode->v);
inode = old;
} else {
@@ -195,8 +201,10 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
/*
- * we really don't want insert_inode_locked2() to be setting
- * I_NEW...
+ * Again, I_NEW makes no sense for bcachefs. This is only needed
+ * for clearing I_NEW, but since the inode was already fully
+ * created and initialized we didn't actually want
+ * inode_insert5() to set it for us.
*/
unlock_new_inode(&inode->v);
}
@@ -227,7 +235,9 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
mutex_init(&inode->ei_update_lock);
two_state_lock_init(&inode->ei_pagecache_lock);
INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
+ inode->ei_flags = 0;
mutex_init(&inode->ei_quota_lock);
+ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
inode->v.i_state = 0;
if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
@@ -1155,6 +1165,7 @@ static const struct file_operations bch_file_operations = {
.read_iter = bch2_read_iter,
.write_iter = bch2_write_iter,
.mmap = bch2_mmap,
+ .get_unmapped_area = thp_get_unmapped_area,
.fsync = bch2_fsync,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
@@ -1486,11 +1497,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
bch2_iget5_set(&inode->v, &inum);
bch2_inode_update_after_write(trans, inode, bi, ~0);
- if (BCH_SUBVOLUME_SNAP(subvol))
- set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
- else
- clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
-
inode->v.i_blocks = bi->bi_sectors;
inode->v.i_ino = bi->bi_inum;
inode->v.i_rdev = bi->bi_dev;
@@ -1502,6 +1508,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
inode->ei_qid = bch_qid(bi);
inode->ei_subvol = inum.subvol;
+ if (BCH_SUBVOLUME_SNAP(subvol))
+ set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+
inode->v.i_mapping->a_ops = &bch_address_space_operations;
switch (inode->v.i_mode & S_IFMT) {
@@ -1967,6 +1976,7 @@ got_sb:
sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
sb->s_uuid = c->sb.user_uuid;
+ sb->s_shrink->seeks = 0;
c->vfs_sb = sb;
strscpy(sb->s_id, c->name, sizeof(sb->s_id));
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index fd277bd58ed3..921bcdb3e5e4 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1677,6 +1677,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
trans_was_restarted(trans, restart_count);
}
+noinline_for_stack
static int check_dirent_inode_dirent(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c_dirent d,
@@ -1773,6 +1774,7 @@ out_noiter:
return ret;
}
+noinline_for_stack
static int check_dirent_target(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c_dirent d,
@@ -1847,6 +1849,7 @@ found:
return ret;
}
+noinline_for_stack
static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c_dirent d)
{
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index f57486794484..c97fa7002b06 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -84,9 +84,10 @@ struct promote_op {
};
static const struct rhashtable_params bch_promote_params = {
- .head_offset = offsetof(struct promote_op, hash),
- .key_offset = offsetof(struct promote_op, pos),
- .key_len = sizeof(struct bpos),
+ .head_offset = offsetof(struct promote_op, hash),
+ .key_offset = offsetof(struct promote_op, pos),
+ .key_len = sizeof(struct bpos),
+ .automatic_shrinking = true,
};
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
@@ -776,18 +777,32 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
PTR_BUCKET_POS(ca, &ptr),
BTREE_ITER_cached);
- prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
- printbuf_indent_add(&buf, 2);
+ u8 *gen = bucket_gen(ca, iter.pos.offset);
+ if (gen) {
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
+ prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
+ printbuf_indent_add(&buf, 2);
- prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
-
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
- if (!ret) {
+ bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
+
+ prt_printf(&buf, "memory gen: %u", *gen);
+
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (!ret) {
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ }
+ } else {
+ prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
+ iter.pos.inode, iter.pos.offset);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "first bucket %u nbuckets %llu\n",
+ ca->mi.first_bucket, ca->mi.nbuckets);
+
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
}
bch2_fs_inconsistent(c, "%s", buf.buf);
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 9401d13e31bb..05e0cbef420b 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -1220,7 +1220,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
u32 snapshot;
struct bucket_to_lock *stale_at;
- int ret;
+ int stale, ret;
if (op->flags & BCH_WRITE_MOVE)
return;
@@ -1299,7 +1299,8 @@ retry:
BUCKET_NOCOW_LOCK_UPDATE);
rcu_read_lock();
- bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen);
+ u8 *gen = bucket_gen(ca, i->b.offset);
+ stale = !gen ? -1 : gen_after(*gen, i->gen);
rcu_read_unlock();
if (unlikely(stale)) {
@@ -1380,8 +1381,18 @@ err_bucket_stale:
break;
}
- /* We can retry this: */
- ret = -BCH_ERR_transaction_restart;
+ struct printbuf buf = PRINTBUF;
+ if (bch2_fs_inconsistent_on(stale < 0, c,
+ "pointer to invalid bucket in nocow path on device %llu\n %s",
+ stale_at->b.inode,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = -EIO;
+ } else {
+ /* We can retry this: */
+ ret = -BCH_ERR_transaction_restart;
+ }
+ printbuf_exit(&buf);
+
goto err_get_ioref;
}
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index adec8e1ea73e..13669dd0e375 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1167,6 +1167,9 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
void bch2_fs_journal_stop(struct journal *j)
{
+ if (!test_bit(JOURNAL_running, &j->flags))
+ return;
+
bch2_journal_reclaim_stop(j);
bch2_journal_flush_all_pins(j);
@@ -1518,6 +1521,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
struct journal_entry_pin *pin;
spin_lock(&j->lock);
+ if (!test_bit(JOURNAL_running, &j->flags)) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+
*seq = max(*seq, j->pin.front);
if (*seq >= j->pin.back) {
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index cdcb1ad49af4..db24ce21b2ac 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1677,6 +1677,13 @@ static CLOSURE_CALLBACK(journal_write_done)
mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
}
+ /*
+ * We don't typically trigger journal writes from her - the next journal
+ * write will be triggered immediately after the previous one is
+ * allocated, in bch2_journal_write() - but the journal write error path
+ * is special:
+ */
+ bch2_journal_do_writes(j);
spin_unlock(&j->lock);
}
@@ -1967,7 +1974,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_replicas_padded replicas;
- struct printbuf journal_debug_buf = PRINTBUF;
unsigned nr_rw_members = 0;
int ret;
@@ -2011,11 +2017,15 @@ CLOSURE_CALLBACK(bch2_journal_write)
}
if (ret) {
- __bch2_journal_debug_to_text(&journal_debug_buf, j);
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
+ prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write: %s"),
+ bch2_err_str(ret));
+ __bch2_journal_debug_to_text(&buf, j);
spin_unlock(&j->lock);
- bch_err(c, "Unable to allocate journal write:\n%s",
- journal_debug_buf.buf);
- printbuf_exit(&journal_debug_buf);
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
goto err;
}
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index ed4846709611..1f25c111c54c 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -232,7 +232,7 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c)
BUG_ON(nr != t->nr);
unsigned i;
- for (src = bl->start, i = eytzinger0_first(t->nr);
+ for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr);
src < bl->start + nr;
src++, i = eytzinger0_next(i, nr)) {
BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index fb11ab0dd00e..bd71ba77de07 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -2,9 +2,6 @@
#ifndef _BCACHEFS_LRU_H
#define _BCACHEFS_LRU_H
-#define LRU_TIME_BITS 48
-#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
-
static inline u64 lru_pos_id(struct bpos pos)
{
return pos.inode >> LRU_TIME_BITS;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 8171f947fac8..6e477fadaa2a 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -547,6 +547,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
ctxt->stats->pos = BBPOS(btree_id, start);
}
+ bch2_trans_begin(trans);
bch2_trans_iter_init(trans, &iter, btree_id, start,
BTREE_ITER_prefetch|
BTREE_ITER_all_snapshots);
@@ -920,7 +921,20 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
? c->opts.metadata_replicas
: io_opts->data_replicas;
- if (!nr_good || nr_good >= replicas)
+ rcu_read_lock();
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned i = 0;
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ptr->cached &&
+ (!ca || !ca->mi.durability))
+ data_opts->kill_ptrs |= BIT(i);
+ i++;
+ }
+ rcu_read_unlock();
+
+ if (!data_opts->kill_ptrs &&
+ (!nr_good || nr_good >= replicas))
return false;
data_opts->target = 0;
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 10bfb31c151b..eb49dd045eff 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -35,9 +35,10 @@ struct buckets_in_flight {
};
static const struct rhashtable_params bch_move_bucket_params = {
- .head_offset = offsetof(struct move_bucket_in_flight, hash),
- .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
- .key_len = sizeof(struct move_bucket_key),
+ .head_offset = offsetof(struct move_bucket_in_flight, hash),
+ .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
+ .key_len = sizeof(struct move_bucket_key),
+ .automatic_shrinking = true,
};
static struct move_bucket_in_flight *
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 25530e0bb2f3..b197ec90d4cb 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -137,7 +137,7 @@ enum fsck_err_opts {
x(errors, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_error_actions), \
- BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \
+ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
NULL, "Action to take on filesystem error") \
x(metadata_replicas, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index cf513fc79ce4..1f9d044ed920 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -326,6 +326,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
case BCH_JSET_ENTRY_btree_root: {
struct btree_root *r;
+ if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX,
+ c, invalid_btree_id,
+ "invalid btree id %u (max %u)",
+ entry->btree_id, BTREE_ID_NR_MAX))
+ return 0;
+
while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
if (ret)
@@ -415,7 +421,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
}
}
-
+fsck_err:
return ret;
}
@@ -658,10 +664,10 @@ int bch2_fs_recovery(struct bch_fs *c)
if (check_version_upgrade(c))
write_sb = true;
+ c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+
if (write_sb)
bch2_write_super(c);
-
- c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
mutex_unlock(&c->sb_lock);
if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 3fb23e399ffb..4710b61631f0 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -228,7 +228,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
dst = (void *) &darray_top(table);
dst->version = cpu_to_le16(src->version);
- dst->recovery_passes[0] = cpu_to_le64(src->recovery_passes);
+ dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes));
dst->recovery_passes[1] = 0;
dst->nr_errors = cpu_to_le16(src->nr_errors);
for (unsigned i = 0; i < src->nr_errors; i++)
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
index bda33e59e226..c1270d790e43 100644
--- a/fs/bcachefs/sb-errors.c
+++ b/fs/bcachefs/sb-errors.c
@@ -110,19 +110,25 @@ out:
void bch2_sb_errors_from_cpu(struct bch_fs *c)
{
bch_sb_errors_cpu *src = &c->fsck_error_counts;
- struct bch_sb_field_errors *dst =
- bch2_sb_field_resize(&c->disk_sb, errors,
- bch2_sb_field_errors_u64s(src->nr));
+ struct bch_sb_field_errors *dst;
unsigned i;
+ mutex_lock(&c->fsck_error_counts_lock);
+
+ dst = bch2_sb_field_resize(&c->disk_sb, errors,
+ bch2_sb_field_errors_u64s(src->nr));
+
if (!dst)
- return;
+ goto err;
for (i = 0; i < src->nr; i++) {
SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
}
+
+err:
+ mutex_unlock(&c->fsck_error_counts_lock);
}
static int bch2_sb_errors_to_cpu(struct bch_fs *c)
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 84d2763bd597..d6f35a99c429 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -2,281 +2,294 @@
#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H
#define _BCACHEFS_SB_ERRORS_FORMAT_H
-#define BCH_SB_ERRS() \
- x(clean_but_journal_not_empty, 0) \
- x(dirty_but_no_journal_entries, 1) \
- x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \
- x(sb_clean_journal_seq_mismatch, 3) \
- x(sb_clean_btree_root_mismatch, 4) \
- x(sb_clean_missing, 5) \
- x(jset_unsupported_version, 6) \
- x(jset_unknown_csum, 7) \
- x(jset_last_seq_newer_than_seq, 8) \
- x(jset_past_bucket_end, 9) \
- x(jset_seq_blacklisted, 10) \
- x(journal_entries_missing, 11) \
- x(journal_entry_replicas_not_marked, 12) \
- x(journal_entry_past_jset_end, 13) \
- x(journal_entry_replicas_data_mismatch, 14) \
- x(journal_entry_bkey_u64s_0, 15) \
- x(journal_entry_bkey_past_end, 16) \
- x(journal_entry_bkey_bad_format, 17) \
- x(journal_entry_bkey_invalid, 18) \
- x(journal_entry_btree_root_bad_size, 19) \
- x(journal_entry_blacklist_bad_size, 20) \
- x(journal_entry_blacklist_v2_bad_size, 21) \
- x(journal_entry_blacklist_v2_start_past_end, 22) \
- x(journal_entry_usage_bad_size, 23) \
- x(journal_entry_data_usage_bad_size, 24) \
- x(journal_entry_clock_bad_size, 25) \
- x(journal_entry_clock_bad_rw, 26) \
- x(journal_entry_dev_usage_bad_size, 27) \
- x(journal_entry_dev_usage_bad_dev, 28) \
- x(journal_entry_dev_usage_bad_pad, 29) \
- x(btree_node_unreadable, 30) \
- x(btree_node_fault_injected, 31) \
- x(btree_node_bad_magic, 32) \
- x(btree_node_bad_seq, 33) \
- x(btree_node_unsupported_version, 34) \
- x(btree_node_bset_older_than_sb_min, 35) \
- x(btree_node_bset_newer_than_sb, 36) \
- x(btree_node_data_missing, 37) \
- x(btree_node_bset_after_end, 38) \
- x(btree_node_replicas_sectors_written_mismatch, 39) \
- x(btree_node_replicas_data_mismatch, 40) \
- x(bset_unknown_csum, 41) \
- x(bset_bad_csum, 42) \
- x(bset_past_end_of_btree_node, 43) \
- x(bset_wrong_sector_offset, 44) \
- x(bset_empty, 45) \
- x(bset_bad_seq, 46) \
- x(bset_blacklisted_journal_seq, 47) \
- x(first_bset_blacklisted_journal_seq, 48) \
- x(btree_node_bad_btree, 49) \
- x(btree_node_bad_level, 50) \
- x(btree_node_bad_min_key, 51) \
- x(btree_node_bad_max_key, 52) \
- x(btree_node_bad_format, 53) \
- x(btree_node_bkey_past_bset_end, 54) \
- x(btree_node_bkey_bad_format, 55) \
- x(btree_node_bad_bkey, 56) \
- x(btree_node_bkey_out_of_order, 57) \
- x(btree_root_bkey_invalid, 58) \
- x(btree_root_read_error, 59) \
- x(btree_root_bad_min_key, 60) \
- x(btree_root_bad_max_key, 61) \
- x(btree_node_read_error, 62) \
- x(btree_node_topology_bad_min_key, 63) \
- x(btree_node_topology_bad_max_key, 64) \
- x(btree_node_topology_overwritten_by_prev_node, 65) \
- x(btree_node_topology_overwritten_by_next_node, 66) \
- x(btree_node_topology_interior_node_empty, 67) \
- x(fs_usage_hidden_wrong, 68) \
- x(fs_usage_btree_wrong, 69) \
- x(fs_usage_data_wrong, 70) \
- x(fs_usage_cached_wrong, 71) \
- x(fs_usage_reserved_wrong, 72) \
- x(fs_usage_persistent_reserved_wrong, 73) \
- x(fs_usage_nr_inodes_wrong, 74) \
- x(fs_usage_replicas_wrong, 75) \
- x(dev_usage_buckets_wrong, 76) \
- x(dev_usage_sectors_wrong, 77) \
- x(dev_usage_fragmented_wrong, 78) \
- x(dev_usage_buckets_ec_wrong, 79) \
- x(bkey_version_in_future, 80) \
- x(bkey_u64s_too_small, 81) \
- x(bkey_invalid_type_for_btree, 82) \
- x(bkey_extent_size_zero, 83) \
- x(bkey_extent_size_greater_than_offset, 84) \
- x(bkey_size_nonzero, 85) \
- x(bkey_snapshot_nonzero, 86) \
- x(bkey_snapshot_zero, 87) \
- x(bkey_at_pos_max, 88) \
- x(bkey_before_start_of_btree_node, 89) \
- x(bkey_after_end_of_btree_node, 90) \
- x(bkey_val_size_nonzero, 91) \
- x(bkey_val_size_too_small, 92) \
- x(alloc_v1_val_size_bad, 93) \
- x(alloc_v2_unpack_error, 94) \
- x(alloc_v3_unpack_error, 95) \
- x(alloc_v4_val_size_bad, 96) \
- x(alloc_v4_backpointers_start_bad, 97) \
- x(alloc_key_data_type_bad, 98) \
- x(alloc_key_empty_but_have_data, 99) \
- x(alloc_key_dirty_sectors_0, 100) \
- x(alloc_key_data_type_inconsistency, 101) \
- x(alloc_key_to_missing_dev_bucket, 102) \
- x(alloc_key_cached_inconsistency, 103) \
- x(alloc_key_cached_but_read_time_zero, 104) \
- x(alloc_key_to_missing_lru_entry, 105) \
- x(alloc_key_data_type_wrong, 106) \
- x(alloc_key_gen_wrong, 107) \
- x(alloc_key_dirty_sectors_wrong, 108) \
- x(alloc_key_cached_sectors_wrong, 109) \
- x(alloc_key_stripe_wrong, 110) \
- x(alloc_key_stripe_redundancy_wrong, 111) \
- x(bucket_sector_count_overflow, 112) \
- x(bucket_metadata_type_mismatch, 113) \
- x(need_discard_key_wrong, 114) \
- x(freespace_key_wrong, 115) \
- x(freespace_hole_missing, 116) \
- x(bucket_gens_val_size_bad, 117) \
- x(bucket_gens_key_wrong, 118) \
- x(bucket_gens_hole_wrong, 119) \
- x(bucket_gens_to_invalid_dev, 120) \
- x(bucket_gens_to_invalid_buckets, 121) \
- x(bucket_gens_nonzero_for_invalid_buckets, 122) \
- x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \
- x(need_discard_freespace_key_bad, 124) \
- x(backpointer_bucket_offset_wrong, 125) \
- x(backpointer_to_missing_device, 126) \
- x(backpointer_to_missing_alloc, 127) \
- x(backpointer_to_missing_ptr, 128) \
- x(lru_entry_at_time_0, 129) \
- x(lru_entry_to_invalid_bucket, 130) \
- x(lru_entry_bad, 131) \
- x(btree_ptr_val_too_big, 132) \
- x(btree_ptr_v2_val_too_big, 133) \
- x(btree_ptr_has_non_ptr, 134) \
- x(extent_ptrs_invalid_entry, 135) \
- x(extent_ptrs_no_ptrs, 136) \
- x(extent_ptrs_too_many_ptrs, 137) \
- x(extent_ptrs_redundant_crc, 138) \
- x(extent_ptrs_redundant_stripe, 139) \
- x(extent_ptrs_unwritten, 140) \
- x(extent_ptrs_written_and_unwritten, 141) \
- x(ptr_to_invalid_device, 142) \
- x(ptr_to_duplicate_device, 143) \
- x(ptr_after_last_bucket, 144) \
- x(ptr_before_first_bucket, 145) \
- x(ptr_spans_multiple_buckets, 146) \
- x(ptr_to_missing_backpointer, 147) \
- x(ptr_to_missing_alloc_key, 148) \
- x(ptr_to_missing_replicas_entry, 149) \
- x(ptr_to_missing_stripe, 150) \
- x(ptr_to_incorrect_stripe, 151) \
- x(ptr_gen_newer_than_bucket_gen, 152) \
- x(ptr_too_stale, 153) \
- x(stale_dirty_ptr, 154) \
- x(ptr_bucket_data_type_mismatch, 155) \
- x(ptr_cached_and_erasure_coded, 156) \
- x(ptr_crc_uncompressed_size_too_small, 157) \
- x(ptr_crc_csum_type_unknown, 158) \
- x(ptr_crc_compression_type_unknown, 159) \
- x(ptr_crc_redundant, 160) \
- x(ptr_crc_uncompressed_size_too_big, 161) \
- x(ptr_crc_nonce_mismatch, 162) \
- x(ptr_stripe_redundant, 163) \
- x(reservation_key_nr_replicas_invalid, 164) \
- x(reflink_v_refcount_wrong, 165) \
- x(reflink_p_to_missing_reflink_v, 166) \
- x(stripe_pos_bad, 167) \
- x(stripe_val_size_bad, 168) \
- x(stripe_sector_count_wrong, 169) \
- x(snapshot_tree_pos_bad, 170) \
- x(snapshot_tree_to_missing_snapshot, 171) \
- x(snapshot_tree_to_missing_subvol, 172) \
- x(snapshot_tree_to_wrong_subvol, 173) \
- x(snapshot_tree_to_snapshot_subvol, 174) \
- x(snapshot_pos_bad, 175) \
- x(snapshot_parent_bad, 176) \
- x(snapshot_children_not_normalized, 177) \
- x(snapshot_child_duplicate, 178) \
- x(snapshot_child_bad, 179) \
- x(snapshot_skiplist_not_normalized, 180) \
- x(snapshot_skiplist_bad, 181) \
- x(snapshot_should_not_have_subvol, 182) \
- x(snapshot_to_bad_snapshot_tree, 183) \
- x(snapshot_bad_depth, 184) \
- x(snapshot_bad_skiplist, 185) \
- x(subvol_pos_bad, 186) \
- x(subvol_not_master_and_not_snapshot, 187) \
- x(subvol_to_missing_root, 188) \
- x(subvol_root_wrong_bi_subvol, 189) \
- x(bkey_in_missing_snapshot, 190) \
- x(inode_pos_inode_nonzero, 191) \
- x(inode_pos_blockdev_range, 192) \
- x(inode_unpack_error, 193) \
- x(inode_str_hash_invalid, 194) \
- x(inode_v3_fields_start_bad, 195) \
- x(inode_snapshot_mismatch, 196) \
- x(inode_unlinked_but_clean, 197) \
- x(inode_unlinked_but_nlink_nonzero, 198) \
- x(inode_checksum_type_invalid, 199) \
- x(inode_compression_type_invalid, 200) \
- x(inode_subvol_root_but_not_dir, 201) \
- x(inode_i_size_dirty_but_clean, 202) \
- x(inode_i_sectors_dirty_but_clean, 203) \
- x(inode_i_sectors_wrong, 204) \
- x(inode_dir_wrong_nlink, 205) \
- x(inode_dir_multiple_links, 206) \
- x(inode_multiple_links_but_nlink_0, 207) \
- x(inode_wrong_backpointer, 208) \
- x(inode_wrong_nlink, 209) \
- x(inode_unreachable, 210) \
- x(deleted_inode_but_clean, 211) \
- x(deleted_inode_missing, 212) \
- x(deleted_inode_is_dir, 213) \
- x(deleted_inode_not_unlinked, 214) \
- x(extent_overlapping, 215) \
- x(extent_in_missing_inode, 216) \
- x(extent_in_non_reg_inode, 217) \
- x(extent_past_end_of_inode, 218) \
- x(dirent_empty_name, 219) \
- x(dirent_val_too_big, 220) \
- x(dirent_name_too_long, 221) \
- x(dirent_name_embedded_nul, 222) \
- x(dirent_name_dot_or_dotdot, 223) \
- x(dirent_name_has_slash, 224) \
- x(dirent_d_type_wrong, 225) \
- x(inode_bi_parent_wrong, 226) \
- x(dirent_in_missing_dir_inode, 227) \
- x(dirent_in_non_dir_inode, 228) \
- x(dirent_to_missing_inode, 229) \
- x(dirent_to_missing_subvol, 230) \
- x(dirent_to_itself, 231) \
- x(quota_type_invalid, 232) \
- x(xattr_val_size_too_small, 233) \
- x(xattr_val_size_too_big, 234) \
- x(xattr_invalid_type, 235) \
- x(xattr_name_invalid_chars, 236) \
- x(xattr_in_missing_inode, 237) \
- x(root_subvol_missing, 238) \
- x(root_dir_missing, 239) \
- x(root_inode_not_dir, 240) \
- x(dir_loop, 241) \
- x(hash_table_key_duplicate, 242) \
- x(hash_table_key_wrong_offset, 243) \
- x(unlinked_inode_not_on_deleted_list, 244) \
- x(reflink_p_front_pad_bad, 245) \
- x(journal_entry_dup_same_device, 246) \
- x(inode_bi_subvol_missing, 247) \
- x(inode_bi_subvol_wrong, 248) \
- x(inode_points_to_missing_dirent, 249) \
- x(inode_points_to_wrong_dirent, 250) \
- x(inode_bi_parent_nonzero, 251) \
- x(dirent_to_missing_parent_subvol, 252) \
- x(dirent_not_visible_in_parent_subvol, 253) \
- x(subvol_fs_path_parent_wrong, 254) \
- x(subvol_root_fs_path_parent_nonzero, 255) \
- x(subvol_children_not_set, 256) \
- x(subvol_children_bad, 257) \
- x(subvol_loop, 258) \
- x(subvol_unreachable, 259) \
- x(btree_node_bkey_bad_u64s, 260) \
- x(btree_node_topology_empty_interior_node, 261) \
- x(btree_ptr_v2_min_key_bad, 262) \
- x(btree_root_unreadable_and_scan_found_nothing, 263) \
- x(snapshot_node_missing, 264) \
- x(dup_backpointer_to_bad_csum_extent, 265) \
- x(btree_bitmap_not_marked, 266) \
- x(sb_clean_entry_overrun, 267) \
- x(btree_ptr_v2_written_0, 268) \
- x(subvol_snapshot_bad, 269) \
- x(subvol_inode_bad, 270)
+enum bch_fsck_flags {
+ FSCK_CAN_FIX = 1 << 0,
+ FSCK_CAN_IGNORE = 1 << 1,
+ FSCK_NEED_FSCK = 1 << 2,
+ FSCK_NO_RATELIMIT = 1 << 3,
+ FSCK_AUTOFIX = 1 << 4,
+};
+
+#define BCH_SB_ERRS() \
+ x(clean_but_journal_not_empty, 0, 0) \
+ x(dirty_but_no_journal_entries, 1, 0) \
+ x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \
+ x(sb_clean_journal_seq_mismatch, 3, 0) \
+ x(sb_clean_btree_root_mismatch, 4, 0) \
+ x(sb_clean_missing, 5, 0) \
+ x(jset_unsupported_version, 6, 0) \
+ x(jset_unknown_csum, 7, 0) \
+ x(jset_last_seq_newer_than_seq, 8, 0) \
+ x(jset_past_bucket_end, 9, 0) \
+ x(jset_seq_blacklisted, 10, 0) \
+ x(journal_entries_missing, 11, 0) \
+ x(journal_entry_replicas_not_marked, 12, 0) \
+ x(journal_entry_past_jset_end, 13, 0) \
+ x(journal_entry_replicas_data_mismatch, 14, 0) \
+ x(journal_entry_bkey_u64s_0, 15, 0) \
+ x(journal_entry_bkey_past_end, 16, 0) \
+ x(journal_entry_bkey_bad_format, 17, 0) \
+ x(journal_entry_bkey_invalid, 18, 0) \
+ x(journal_entry_btree_root_bad_size, 19, 0) \
+ x(journal_entry_blacklist_bad_size, 20, 0) \
+ x(journal_entry_blacklist_v2_bad_size, 21, 0) \
+ x(journal_entry_blacklist_v2_start_past_end, 22, 0) \
+ x(journal_entry_usage_bad_size, 23, 0) \
+ x(journal_entry_data_usage_bad_size, 24, 0) \
+ x(journal_entry_clock_bad_size, 25, 0) \
+ x(journal_entry_clock_bad_rw, 26, 0) \
+ x(journal_entry_dev_usage_bad_size, 27, 0) \
+ x(journal_entry_dev_usage_bad_dev, 28, 0) \
+ x(journal_entry_dev_usage_bad_pad, 29, 0) \
+ x(btree_node_unreadable, 30, 0) \
+ x(btree_node_fault_injected, 31, 0) \
+ x(btree_node_bad_magic, 32, 0) \
+ x(btree_node_bad_seq, 33, 0) \
+ x(btree_node_unsupported_version, 34, 0) \
+ x(btree_node_bset_older_than_sb_min, 35, 0) \
+ x(btree_node_bset_newer_than_sb, 36, 0) \
+ x(btree_node_data_missing, 37, 0) \
+ x(btree_node_bset_after_end, 38, 0) \
+ x(btree_node_replicas_sectors_written_mismatch, 39, 0) \
+ x(btree_node_replicas_data_mismatch, 40, 0) \
+ x(bset_unknown_csum, 41, 0) \
+ x(bset_bad_csum, 42, 0) \
+ x(bset_past_end_of_btree_node, 43, 0) \
+ x(bset_wrong_sector_offset, 44, 0) \
+ x(bset_empty, 45, 0) \
+ x(bset_bad_seq, 46, 0) \
+ x(bset_blacklisted_journal_seq, 47, 0) \
+ x(first_bset_blacklisted_journal_seq, 48, 0) \
+ x(btree_node_bad_btree, 49, 0) \
+ x(btree_node_bad_level, 50, 0) \
+ x(btree_node_bad_min_key, 51, 0) \
+ x(btree_node_bad_max_key, 52, 0) \
+ x(btree_node_bad_format, 53, 0) \
+ x(btree_node_bkey_past_bset_end, 54, 0) \
+ x(btree_node_bkey_bad_format, 55, 0) \
+ x(btree_node_bad_bkey, 56, 0) \
+ x(btree_node_bkey_out_of_order, 57, 0) \
+ x(btree_root_bkey_invalid, 58, 0) \
+ x(btree_root_read_error, 59, 0) \
+ x(btree_root_bad_min_key, 60, 0) \
+ x(btree_root_bad_max_key, 61, 0) \
+ x(btree_node_read_error, 62, 0) \
+ x(btree_node_topology_bad_min_key, 63, 0) \
+ x(btree_node_topology_bad_max_key, 64, 0) \
+ x(btree_node_topology_overwritten_by_prev_node, 65, 0) \
+ x(btree_node_topology_overwritten_by_next_node, 66, 0) \
+ x(btree_node_topology_interior_node_empty, 67, 0) \
+ x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \
+ x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \
+ x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \
+ x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \
+ x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \
+ x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \
+ x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \
+ x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \
+ x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \
+ x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \
+ x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \
+ x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \
+ x(bkey_version_in_future, 80, 0) \
+ x(bkey_u64s_too_small, 81, 0) \
+ x(bkey_invalid_type_for_btree, 82, 0) \
+ x(bkey_extent_size_zero, 83, 0) \
+ x(bkey_extent_size_greater_than_offset, 84, 0) \
+ x(bkey_size_nonzero, 85, 0) \
+ x(bkey_snapshot_nonzero, 86, 0) \
+ x(bkey_snapshot_zero, 87, 0) \
+ x(bkey_at_pos_max, 88, 0) \
+ x(bkey_before_start_of_btree_node, 89, 0) \
+ x(bkey_after_end_of_btree_node, 90, 0) \
+ x(bkey_val_size_nonzero, 91, 0) \
+ x(bkey_val_size_too_small, 92, 0) \
+ x(alloc_v1_val_size_bad, 93, 0) \
+ x(alloc_v2_unpack_error, 94, 0) \
+ x(alloc_v3_unpack_error, 95, 0) \
+ x(alloc_v4_val_size_bad, 96, 0) \
+ x(alloc_v4_backpointers_start_bad, 97, 0) \
+ x(alloc_key_data_type_bad, 98, 0) \
+ x(alloc_key_empty_but_have_data, 99, 0) \
+ x(alloc_key_dirty_sectors_0, 100, 0) \
+ x(alloc_key_data_type_inconsistency, 101, 0) \
+ x(alloc_key_to_missing_dev_bucket, 102, 0) \
+ x(alloc_key_cached_inconsistency, 103, 0) \
+ x(alloc_key_cached_but_read_time_zero, 104, 0) \
+ x(alloc_key_to_missing_lru_entry, 105, 0) \
+ x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \
+ x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \
+ x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \
+ x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \
+ x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \
+ x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \
+ x(bucket_sector_count_overflow, 112, 0) \
+ x(bucket_metadata_type_mismatch, 113, 0) \
+ x(need_discard_key_wrong, 114, 0) \
+ x(freespace_key_wrong, 115, 0) \
+ x(freespace_hole_missing, 116, 0) \
+ x(bucket_gens_val_size_bad, 117, 0) \
+ x(bucket_gens_key_wrong, 118, 0) \
+ x(bucket_gens_hole_wrong, 119, 0) \
+ x(bucket_gens_to_invalid_dev, 120, 0) \
+ x(bucket_gens_to_invalid_buckets, 121, 0) \
+ x(bucket_gens_nonzero_for_invalid_buckets, 122, 0) \
+ x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \
+ x(need_discard_freespace_key_bad, 124, 0) \
+ x(backpointer_bucket_offset_wrong, 125, 0) \
+ x(backpointer_to_missing_device, 126, 0) \
+ x(backpointer_to_missing_alloc, 127, 0) \
+ x(backpointer_to_missing_ptr, 128, 0) \
+ x(lru_entry_at_time_0, 129, 0) \
+ x(lru_entry_to_invalid_bucket, 130, 0) \
+ x(lru_entry_bad, 131, 0) \
+ x(btree_ptr_val_too_big, 132, 0) \
+ x(btree_ptr_v2_val_too_big, 133, 0) \
+ x(btree_ptr_has_non_ptr, 134, 0) \
+ x(extent_ptrs_invalid_entry, 135, 0) \
+ x(extent_ptrs_no_ptrs, 136, 0) \
+ x(extent_ptrs_too_many_ptrs, 137, 0) \
+ x(extent_ptrs_redundant_crc, 138, 0) \
+ x(extent_ptrs_redundant_stripe, 139, 0) \
+ x(extent_ptrs_unwritten, 140, 0) \
+ x(extent_ptrs_written_and_unwritten, 141, 0) \
+ x(ptr_to_invalid_device, 142, 0) \
+ x(ptr_to_duplicate_device, 143, 0) \
+ x(ptr_after_last_bucket, 144, 0) \
+ x(ptr_before_first_bucket, 145, 0) \
+ x(ptr_spans_multiple_buckets, 146, 0) \
+ x(ptr_to_missing_backpointer, 147, 0) \
+ x(ptr_to_missing_alloc_key, 148, 0) \
+ x(ptr_to_missing_replicas_entry, 149, 0) \
+ x(ptr_to_missing_stripe, 150, 0) \
+ x(ptr_to_incorrect_stripe, 151, 0) \
+ x(ptr_gen_newer_than_bucket_gen, 152, 0) \
+ x(ptr_too_stale, 153, 0) \
+ x(stale_dirty_ptr, 154, 0) \
+ x(ptr_bucket_data_type_mismatch, 155, 0) \
+ x(ptr_cached_and_erasure_coded, 156, 0) \
+ x(ptr_crc_uncompressed_size_too_small, 157, 0) \
+ x(ptr_crc_csum_type_unknown, 158, 0) \
+ x(ptr_crc_compression_type_unknown, 159, 0) \
+ x(ptr_crc_redundant, 160, 0) \
+ x(ptr_crc_uncompressed_size_too_big, 161, 0) \
+ x(ptr_crc_nonce_mismatch, 162, 0) \
+ x(ptr_stripe_redundant, 163, 0) \
+ x(reservation_key_nr_replicas_invalid, 164, 0) \
+ x(reflink_v_refcount_wrong, 165, 0) \
+ x(reflink_p_to_missing_reflink_v, 166, 0) \
+ x(stripe_pos_bad, 167, 0) \
+ x(stripe_val_size_bad, 168, 0) \
+ x(stripe_sector_count_wrong, 169, 0) \
+ x(snapshot_tree_pos_bad, 170, 0) \
+ x(snapshot_tree_to_missing_snapshot, 171, 0) \
+ x(snapshot_tree_to_missing_subvol, 172, 0) \
+ x(snapshot_tree_to_wrong_subvol, 173, 0) \
+ x(snapshot_tree_to_snapshot_subvol, 174, 0) \
+ x(snapshot_pos_bad, 175, 0) \
+ x(snapshot_parent_bad, 176, 0) \
+ x(snapshot_children_not_normalized, 177, 0) \
+ x(snapshot_child_duplicate, 178, 0) \
+ x(snapshot_child_bad, 179, 0) \
+ x(snapshot_skiplist_not_normalized, 180, 0) \
+ x(snapshot_skiplist_bad, 181, 0) \
+ x(snapshot_should_not_have_subvol, 182, 0) \
+ x(snapshot_to_bad_snapshot_tree, 183, 0) \
+ x(snapshot_bad_depth, 184, 0) \
+ x(snapshot_bad_skiplist, 185, 0) \
+ x(subvol_pos_bad, 186, 0) \
+ x(subvol_not_master_and_not_snapshot, 187, 0) \
+ x(subvol_to_missing_root, 188, 0) \
+ x(subvol_root_wrong_bi_subvol, 189, 0) \
+ x(bkey_in_missing_snapshot, 190, 0) \
+ x(inode_pos_inode_nonzero, 191, 0) \
+ x(inode_pos_blockdev_range, 192, 0) \
+ x(inode_unpack_error, 193, 0) \
+ x(inode_str_hash_invalid, 194, 0) \
+ x(inode_v3_fields_start_bad, 195, 0) \
+ x(inode_snapshot_mismatch, 196, 0) \
+ x(inode_unlinked_but_clean, 197, 0) \
+ x(inode_unlinked_but_nlink_nonzero, 198, 0) \
+ x(inode_checksum_type_invalid, 199, 0) \
+ x(inode_compression_type_invalid, 200, 0) \
+ x(inode_subvol_root_but_not_dir, 201, 0) \
+ x(inode_i_size_dirty_but_clean, 202, 0) \
+ x(inode_i_sectors_dirty_but_clean, 203, 0) \
+ x(inode_i_sectors_wrong, 204, 0) \
+ x(inode_dir_wrong_nlink, 205, 0) \
+ x(inode_dir_multiple_links, 206, 0) \
+ x(inode_multiple_links_but_nlink_0, 207, 0) \
+ x(inode_wrong_backpointer, 208, 0) \
+ x(inode_wrong_nlink, 209, 0) \
+ x(inode_unreachable, 210, 0) \
+ x(deleted_inode_but_clean, 211, 0) \
+ x(deleted_inode_missing, 212, 0) \
+ x(deleted_inode_is_dir, 213, 0) \
+ x(deleted_inode_not_unlinked, 214, 0) \
+ x(extent_overlapping, 215, 0) \
+ x(extent_in_missing_inode, 216, 0) \
+ x(extent_in_non_reg_inode, 217, 0) \
+ x(extent_past_end_of_inode, 218, 0) \
+ x(dirent_empty_name, 219, 0) \
+ x(dirent_val_too_big, 220, 0) \
+ x(dirent_name_too_long, 221, 0) \
+ x(dirent_name_embedded_nul, 222, 0) \
+ x(dirent_name_dot_or_dotdot, 223, 0) \
+ x(dirent_name_has_slash, 224, 0) \
+ x(dirent_d_type_wrong, 225, 0) \
+ x(inode_bi_parent_wrong, 226, 0) \
+ x(dirent_in_missing_dir_inode, 227, 0) \
+ x(dirent_in_non_dir_inode, 228, 0) \
+ x(dirent_to_missing_inode, 229, 0) \
+ x(dirent_to_missing_subvol, 230, 0) \
+ x(dirent_to_itself, 231, 0) \
+ x(quota_type_invalid, 232, 0) \
+ x(xattr_val_size_too_small, 233, 0) \
+ x(xattr_val_size_too_big, 234, 0) \
+ x(xattr_invalid_type, 235, 0) \
+ x(xattr_name_invalid_chars, 236, 0) \
+ x(xattr_in_missing_inode, 237, 0) \
+ x(root_subvol_missing, 238, 0) \
+ x(root_dir_missing, 239, 0) \
+ x(root_inode_not_dir, 240, 0) \
+ x(dir_loop, 241, 0) \
+ x(hash_table_key_duplicate, 242, 0) \
+ x(hash_table_key_wrong_offset, 243, 0) \
+ x(unlinked_inode_not_on_deleted_list, 244, 0) \
+ x(reflink_p_front_pad_bad, 245, 0) \
+ x(journal_entry_dup_same_device, 246, 0) \
+ x(inode_bi_subvol_missing, 247, 0) \
+ x(inode_bi_subvol_wrong, 248, 0) \
+ x(inode_points_to_missing_dirent, 249, 0) \
+ x(inode_points_to_wrong_dirent, 250, 0) \
+ x(inode_bi_parent_nonzero, 251, 0) \
+ x(dirent_to_missing_parent_subvol, 252, 0) \
+ x(dirent_not_visible_in_parent_subvol, 253, 0) \
+ x(subvol_fs_path_parent_wrong, 254, 0) \
+ x(subvol_root_fs_path_parent_nonzero, 255, 0) \
+ x(subvol_children_not_set, 256, 0) \
+ x(subvol_children_bad, 257, 0) \
+ x(subvol_loop, 258, 0) \
+ x(subvol_unreachable, 259, 0) \
+ x(btree_node_bkey_bad_u64s, 260, 0) \
+ x(btree_node_topology_empty_interior_node, 261, 0) \
+ x(btree_ptr_v2_min_key_bad, 262, 0) \
+ x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \
+ x(snapshot_node_missing, 264, 0) \
+ x(dup_backpointer_to_bad_csum_extent, 265, 0) \
+ x(btree_bitmap_not_marked, 266, 0) \
+ x(sb_clean_entry_overrun, 267, 0) \
+ x(btree_ptr_v2_written_0, 268, 0) \
+ x(subvol_snapshot_bad, 269, 0) \
+ x(subvol_inode_bad, 270, 0) \
+ x(alloc_key_stripe_sectors_wrong, 271, 0) \
+ x(accounting_mismatch, 272, 0) \
+ x(accounting_replicas_not_marked, 273, 0) \
+ x(invalid_btree_id, 274, 0) \
+ x(alloc_key_io_time_bad, 275, 0)
enum bch_sb_error_id {
-#define x(t, n) BCH_FSCK_ERR_##t = n,
+#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
BCH_SB_ERRS()
#undef x
BCH_SB_ERR_MAX
diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h
index c1860d8163fb..c4b3d8d3f414 100644
--- a/fs/bcachefs/seqmutex.h
+++ b/fs/bcachefs/seqmutex.h
@@ -19,17 +19,14 @@ static inline bool seqmutex_trylock(struct seqmutex *lock)
static inline void seqmutex_lock(struct seqmutex *lock)
{
mutex_lock(&lock->lock);
-}
-
-static inline void seqmutex_unlock(struct seqmutex *lock)
-{
lock->seq++;
- mutex_unlock(&lock->lock);
}
-static inline u32 seqmutex_seq(struct seqmutex *lock)
+static inline u32 seqmutex_unlock(struct seqmutex *lock)
{
- return lock->seq;
+ u32 seq = lock->seq;
+ mutex_unlock(&lock->lock);
+ return seq;
}
static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq)
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 51918acfd726..24023d6a9698 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -168,6 +168,9 @@ static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
+ if (unlikely(new_bytes > INT_MAX))
+ return NULL;
+
new = kvzalloc(new_bytes, GFP_KERNEL);
if (!new)
return NULL;
@@ -1565,13 +1568,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
return 0;
- if (!test_bit(BCH_FS_started, &c->flags)) {
- ret = bch2_fs_read_write_early(c);
- bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
- if (ret)
- return ret;
- }
-
trans = bch2_trans_get(c);
/*
@@ -1687,6 +1683,8 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+ set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
+
bch2_delete_dead_snapshots(c);
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index cbad9b27874f..c8c266cb5797 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -300,7 +300,7 @@ not_found:
if (!found && (flags & STR_HASH_must_replace)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
} else if (found && (flags & STR_HASH_must_create)) {
- ret = -EEXIST;
+ ret = -BCH_ERR_EEXIST_str_hash_set;
} else {
if (!found && slot.path)
swap(iter, slot);
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index d73a0222f709..b156fc85b8a3 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -649,9 +649,10 @@ reread:
bytes = vstruct_bytes(sb->sb);
- if (bytes > 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits)) {
- prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
- bytes, 512UL << sb->sb->layout.sb_max_size_bits);
+ u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits);
+ if (bytes > sb_size) {
+ prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)",
+ bytes, sb_size);
return -BCH_ERR_invalid_sb_too_big;
}
@@ -1310,15 +1311,15 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "Device index:\t%u\n", sb->dev_idx);
- prt_str(out, "Label:\t");
+ prt_printf(out, "Label:\t");
prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
prt_newline(out);
- prt_str(out, "Version:\t");
+ prt_printf(out, "Version:\t");
bch2_version_to_text(out, le16_to_cpu(sb->version));
prt_newline(out);
- prt_str(out, "Version upgrade complete:\t");
+ prt_printf(out, "Version upgrade complete:\t");
bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
prt_newline(out);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index df2bea38e83f..fb906467201e 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -536,7 +536,6 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
- bch2_fs_allocator_background_exit(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);
@@ -582,8 +581,10 @@ static void __bch2_fs_free(struct bch_fs *c)
if (c->write_ref_wq)
destroy_workqueue(c->write_ref_wq);
- if (c->io_complete_wq)
- destroy_workqueue(c->io_complete_wq);
+ if (c->btree_write_submit_wq)
+ destroy_workqueue(c->btree_write_submit_wq);
+ if (c->btree_read_complete_wq)
+ destroy_workqueue(c->btree_read_complete_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
if (c->btree_io_complete_wq)
@@ -878,8 +879,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->io_complete_wq = alloc_workqueue("bcachefs_io",
+ !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
+ !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
WQ_FREEZABLE, 0)) ||
#ifndef BCH_WRITE_REF_DEBUG
@@ -908,9 +911,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_io_clock_init(&c->io_clock[WRITE]) ?:
bch2_fs_journal_init(&c->journal) ?:
bch2_fs_replicas_init(c) ?:
+ bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_cache_init(c) ?:
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
- bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_interior_update_init(c) ?:
bch2_fs_buckets_waiting_for_journal_init(c) ?:
bch2_fs_btree_write_buffer_init(c) ?:
@@ -927,12 +930,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret)
goto err;
- for (i = 0; i < c->sb.nr_devices; i++)
- if (bch2_member_exists(c->disk_sb.sb, i) &&
- bch2_dev_alloc(c, i)) {
- ret = -EEXIST;
+ for (i = 0; i < c->sb.nr_devices; i++) {
+ if (!bch2_member_exists(c->disk_sb.sb, i))
+ continue;
+ ret = bch2_dev_alloc(c, i);
+ if (ret)
goto err;
- }
+ }
bch2_journal_entry_res_resize(&c->journal,
&c->btree_root_journal_res,
@@ -1190,6 +1194,7 @@ static void bch2_dev_free(struct bch_dev *ca)
kfree(ca->buckets_nouse);
bch2_free_super(&ca->disk_sb);
+ bch2_dev_allocator_background_exit(ca);
bch2_dev_journal_exit(ca);
free_percpu(ca->io_done);
@@ -1312,6 +1317,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
atomic_long_set(&ca->ref, 1);
#endif
+ bch2_dev_allocator_background_init(ca);
+
if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
@@ -1524,6 +1531,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
* The allocator thread itself allocates btree nodes, so stop it first:
*/
bch2_dev_allocator_remove(c, ca);
+ bch2_recalc_capacity(c);
bch2_dev_journal_stop(&c->journal, ca);
}
@@ -1535,6 +1543,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
+ bch2_dev_do_discards(ca);
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 477f350a8bd0..e3a57196b0ee 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -741,7 +741,9 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
ret = btrfs_bio_csum(bbio);
if (ret)
goto fail_put_bio;
- } else if (use_append) {
+ } else if (use_append ||
+ (btrfs_is_zoned(fs_info) && inode &&
+ inode->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_alloc_dummy_sum(bbio);
if (ret)
goto fail_put_bio;
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 1e09aeea69c2..60066822b532 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1785,6 +1785,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
struct btrfs_block_group *bg;
struct btrfs_space_info *space_info;
+ LIST_HEAD(retry_list);
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
@@ -1921,8 +1922,20 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
}
next:
- if (ret)
- btrfs_mark_bg_to_reclaim(bg);
+ if (ret) {
+ /* Refcount held by the reclaim_bgs list after splice. */
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * This block group might be added to the unused list
+ * during the above process. Move it back to the
+ * reclaim list otherwise.
+ */
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ list_add_tail(&bg->bg_list, &retry_list);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ }
btrfs_put_block_group(bg);
mutex_unlock(&fs_info->reclaim_bgs_lock);
@@ -1942,6 +1955,9 @@ next:
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
end:
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
btrfs_exclop_finish(fs_info);
sb_end_write(fs_info->sb);
}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 91c994b569f3..6ed495ca7a31 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -89,6 +89,16 @@ enum {
BTRFS_INODE_FREE_SPACE_INODE,
/* Set when there are no capabilities in XATTs for the inode. */
BTRFS_INODE_NO_CAP_XATTR,
+ /*
+ * Set if an error happened when doing a COW write before submitting a
+ * bio or during writeback. Used for both buffered writes and direct IO
+ * writes. This is to signal a fast fsync that it has to wait for
+ * ordered extents to complete and therefore not log extent maps that
+ * point to unwritten extents (when an ordered extent completes and it
+ * has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its
+ * range).
+ */
+ BTRFS_INODE_COW_WRITE_ERROR,
};
/* in memory btrfs inode */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1b20b3e390df..38cdb8875e8e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4538,18 +4538,10 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info)
{
struct rb_node *node;
- struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
struct btrfs_delayed_ref_node *ref;
- delayed_refs = &trans->delayed_refs;
-
spin_lock(&delayed_refs->lock);
- if (atomic_read(&delayed_refs->num_entries) == 0) {
- spin_unlock(&delayed_refs->lock);
- btrfs_debug(fs_info, "delayed_refs has NO entry");
- return;
- }
-
while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
struct rb_node *n;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 597387e9f040..958155cc43a8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3553,7 +3553,7 @@ err:
for (int i = 0; i < num_folios; i++) {
if (eb->folios[i]) {
detach_extent_buffer_folio(eb, eb->folios[i]);
- __folio_put(eb->folios[i]);
+ folio_put(eb->folios[i]);
}
}
__free_extent_buffer(eb);
@@ -3689,6 +3689,8 @@ static struct extent_buffer *grab_extent_buffer(
struct folio *folio = page_folio(page);
struct extent_buffer *exists;
+ lockdep_assert_held(&page->mapping->i_private_lock);
+
/*
* For subpage case, we completely rely on radix tree to ensure we
* don't try to insert two ebs for the same bytenr. So here we always
@@ -3756,13 +3758,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
* The caller needs to free the existing folios and retry using the same order.
*/
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
+ struct btrfs_subpage *prealloc,
struct extent_buffer **found_eb_ret)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
const unsigned long index = eb->start >> PAGE_SHIFT;
- struct folio *existing_folio;
+ struct folio *existing_folio = NULL;
int ret;
ASSERT(found_eb_ret);
@@ -3774,12 +3777,14 @@ retry:
ret = filemap_add_folio(mapping, eb->folios[i], index + i,
GFP_NOFS | __GFP_NOFAIL);
if (!ret)
- return 0;
+ goto finish;
existing_folio = filemap_lock_folio(mapping, index + i);
/* The page cache only exists for a very short time, just retry. */
- if (IS_ERR(existing_folio))
+ if (IS_ERR(existing_folio)) {
+ existing_folio = NULL;
goto retry;
+ }
/* For now, we should only have single-page folios for btree inode. */
ASSERT(folio_nr_pages(existing_folio) == 1);
@@ -3790,14 +3795,13 @@ retry:
return -EAGAIN;
}
- if (fs_info->nodesize < PAGE_SIZE) {
- /*
- * We're going to reuse the existing page, can drop our page
- * and subpage structure now.
- */
+finish:
+ spin_lock(&mapping->i_private_lock);
+ if (existing_folio && fs_info->nodesize < PAGE_SIZE) {
+ /* We're going to reuse the existing page, can drop our folio now. */
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
- } else {
+ } else if (existing_folio) {
struct extent_buffer *existing_eb;
existing_eb = grab_extent_buffer(fs_info,
@@ -3805,6 +3809,7 @@ retry:
if (existing_eb) {
/* The extent buffer still exists, we can use it directly. */
*found_eb_ret = existing_eb;
+ spin_unlock(&mapping->i_private_lock);
folio_unlock(existing_folio);
folio_put(existing_folio);
return 1;
@@ -3813,6 +3818,22 @@ retry:
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
}
+ eb->folio_size = folio_size(eb->folios[i]);
+ eb->folio_shift = folio_shift(eb->folios[i]);
+ /* Should not fail, as we have preallocated the memory. */
+ ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc);
+ ASSERT(!ret);
+ /*
+ * To inform we have an extra eb under allocation, so that
+ * detach_extent_buffer_page() won't release the folio private when the
+ * eb hasn't been inserted into radix tree yet.
+ *
+ * The ref will be decreased when the eb releases the page, in
+ * detach_extent_buffer_page(). Thus needs no special handling in the
+ * error path.
+ */
+ btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
@@ -3824,7 +3845,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
int attached = 0;
struct extent_buffer *eb;
struct extent_buffer *existing_eb = NULL;
- struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct btrfs_subpage *prealloc = NULL;
u64 lockdep_owner = owner_root;
bool page_contig = true;
@@ -3890,7 +3910,7 @@ reallocate:
for (int i = 0; i < num_folios; i++) {
struct folio *folio;
- ret = attach_eb_folio_to_filemap(eb, i, &existing_eb);
+ ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
if (ret > 0) {
ASSERT(existing_eb);
goto out;
@@ -3927,24 +3947,6 @@ reallocate:
* and free the allocated page.
*/
folio = eb->folios[i];
- eb->folio_size = folio_size(folio);
- eb->folio_shift = folio_shift(folio);
- spin_lock(&mapping->i_private_lock);
- /* Should not fail, as we have preallocated the memory */
- ret = attach_extent_buffer_folio(eb, folio, prealloc);
- ASSERT(!ret);
- /*
- * To inform we have extra eb under allocation, so that
- * detach_extent_buffer_page() won't release the folio private
- * when the eb hasn't yet been inserted into radix tree.
- *
- * The ref will be decreased when the eb released the page, in
- * detach_extent_buffer_page().
- * Thus needs no special handling in error path.
- */
- btrfs_folio_inc_eb_refs(fs_info, folio);
- spin_unlock(&mapping->i_private_lock);
-
WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
/*
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e764ac3f22e2..d90138683a0a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1885,6 +1885,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
if (full_sync || btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode, start, len);
+ clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &BTRFS_I(inode)->runtime_flags);
} else {
/*
* Get our ordered extents as soon as possible to avoid doing
@@ -1894,6 +1895,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
&ctx.ordered_extents);
ret = filemap_fdatawait_range(inode->i_mapping, start, end);
+ if (ret)
+ goto out_release_extents;
+
+ /*
+ * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
+ * starting and waiting for writeback, because for buffered IO
+ * it may have been set during the end IO callback
+ * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
+ * case an error happened and we need to wait for ordered
+ * extents to complete so that any extent maps that point to
+ * unwritten locations are dropped and we don't log them.
+ */
+ if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = btrfs_wait_ordered_range(inode, start, len);
}
if (ret)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 3ab8dea5036b..dabc3d0793cf 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2697,7 +2697,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
int bg_reclaim_threshold = 0;
- bool initial = (size == block_group->length);
+ bool initial = ((size == block_group->length) && (block_group->alloc_offset == 0));
u64 reclaimable_unusable;
WARN_ON(!initial && offset + size > block_group->zone_capacity);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 753db965f7c0..3a2b902b2d1f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -10385,7 +10385,7 @@ out_unlock:
out_folios:
for (i = 0; i < nr_folios; i++) {
if (folios[i])
- __folio_put(folios[i]);
+ folio_put(folios[i]);
}
kvfree(folios);
out:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index c5bdd674f55c..35a413ce935d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -388,6 +388,37 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ /*
+ * If this is a COW write it means we created new extent maps for the
+ * range and they point to unwritten locations if we got an error either
+ * before submitting a bio or during IO.
+ *
+ * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
+ * are queuing its completion below. During completion, at
+ * btrfs_finish_one_ordered(), we will drop the extent maps for the
+ * unwritten extents.
+ *
+ * However because completion runs in a work queue we can end up having
+ * a fast fsync running before that. In the case of direct IO, once we
+ * unlock the inode the fsync might start, and we queue the completion
+ * before unlocking the inode. In the case of buffered IO when writeback
+ * finishes (end_bbio_data_write()) we queue the completion, so if the
+ * writeback was triggered by a fast fsync, the fsync might start
+ * logging before ordered extent completion runs in the work queue.
+ *
+ * The fast fsync will log file extent items based on the extent maps it
+ * finds, so if by the time it collects extent maps the ordered extent
+ * completion didn't happen yet, it will log file extent items that
+ * point to unwritten extents, resulting in a corruption if a crash
+ * happens and the log tree is replayed. Note that a fast fsync does not
+ * wait for completion of ordered extents in order to reduce latency.
+ *
+ * Set a flag in the inode so that the next fast fsync will wait for
+ * ordered extents to complete before starting to log.
+ */
+ if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+ set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
+
if (ret)
btrfs_queue_ordered_fn(ordered);
return ret;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fc2a7ea26354..39a15cca58ca 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1351,7 +1351,7 @@ static int flush_reservations(struct btrfs_fs_info *fs_info)
int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *quota_root;
+ struct btrfs_root *quota_root = NULL;
struct btrfs_trans_handle *trans = NULL;
int ret = 0;
@@ -1449,9 +1449,9 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
quota_root->node, 0, 1);
- btrfs_put_root(quota_root);
out:
+ btrfs_put_root(quota_root);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (ret && trans)
btrfs_end_transaction(trans);
@@ -3062,8 +3062,6 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_inherit *inherit,
size_t size)
{
- if (!btrfs_qgroup_enabled(fs_info))
- return 0;
if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP)
return -EOPNOTSUPP;
if (size < sizeof(*inherit) || size > PAGE_SIZE)
@@ -3085,6 +3083,14 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
return -EINVAL;
/*
+ * Skip the inherit source qgroups check if qgroup is not enabled.
+ * Qgroup can still be later enabled causing problems, but in that case
+ * btrfs_qgroup_inherit() would just ignore those invalid ones.
+ */
+ if (!btrfs_qgroup_enabled(fs_info))
+ return 0;
+
+ /*
* Now check all the remaining qgroups, they should all:
*
* - Exist
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index cf531255ab76..9522a8b79d22 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -441,7 +441,8 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
u32 item_size = btrfs_item_size(leaf, slot);
unsigned long end, ptr;
u64 offset, flags, count;
- int type, ret;
+ int type;
+ int ret = 0;
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
@@ -486,7 +487,11 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
key->objectid, key->offset);
break;
case BTRFS_EXTENT_OWNER_REF_KEY:
- WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) {
+ btrfs_err(fs_info,
+ "found extent owner ref without simple quotas enabled");
+ ret = -EINVAL;
+ }
break;
default:
btrfs_err(fs_info, "invalid key type in iref");
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index afd6932f5e89..d7caa3732f07 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1688,20 +1688,24 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
(i << fs_info->sectorsize_bits);
int err;
- bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
- fs_info, scrub_read_endio, stripe);
- bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
-
io_stripe.is_scrub = true;
+ stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits;
+ /*
+ * For RST cases, we need to manually split the bbio to
+ * follow the RST boundary.
+ */
err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
- &stripe_len, &bioc, &io_stripe,
- &mirror);
+ &stripe_len, &bioc, &io_stripe, &mirror);
btrfs_put_bioc(bioc);
- if (err) {
- btrfs_bio_end_io(bbio,
- errno_to_blk_status(err));
- return;
+ if (err < 0) {
+ set_bit(i, &stripe->io_error_bitmap);
+ set_bit(i, &stripe->error_bitmap);
+ continue;
}
+
+ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
+ fs_info, scrub_read_endio, stripe);
+ bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
}
__bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d620323d08ea..ae8c56442549 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -373,11 +373,18 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
* "optimal" chunk size based on the fs size. However when we actually
* allocate the chunk we will strip this down further, making it no more
* than 10% of the disk or 1G, whichever is smaller.
+ *
+ * On the zoned mode, we need to use zone_size (=
+ * data_sinfo->chunk_size) as it is.
*/
data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
- data_chunk_size = min(data_sinfo->chunk_size,
- mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
- data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+ if (!btrfs_is_zoned(fs_info)) {
+ data_chunk_size = min(data_sinfo->chunk_size,
+ mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
+ data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+ } else {
+ data_chunk_size = data_sinfo->chunk_size;
+ }
/*
* Since data allocations immediately use block groups as part of the
@@ -405,6 +412,17 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
avail >>= 3;
else
avail >>= 1;
+
+ /*
+ * On the zoned mode, we always allocate one zone as one chunk.
+ * Returning non-zone size alingned bytes here will result in
+ * less pressure for the async metadata reclaim process, and it
+ * will over-commit too much leading to ENOSPC. Align down to the
+ * zone size to avoid that.
+ */
+ if (btrfs_is_zoned(fs_info))
+ avail = ALIGN_DOWN(avail, fs_info->zone_size);
+
return avail;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5146387b416b..0bce1d45e252 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -138,6 +138,25 @@ static void wait_log_commit(struct btrfs_root *root, int transid);
* and once to do all the other items.
*/
+static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
+{
+ unsigned int nofs_flag;
+ struct inode *inode;
+
+ /*
+ * We're holding a transaction handle whether we are logging or
+ * replaying a log tree, so we must make sure NOFS semantics apply
+ * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL
+ * to allocate an inode, which can recurse back into the filesystem and
+ * attempt a transaction commit, resulting in a deadlock.
+ */
+ nofs_flag = memalloc_nofs_save();
+ inode = btrfs_iget(root->fs_info->sb, objectid, root);
+ memalloc_nofs_restore(nofs_flag);
+
+ return inode;
+}
+
/*
* start a sub transaction and setup the log tree
* this increments the log tree writer count to make the people
@@ -600,7 +619,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
{
struct inode *inode;
- inode = btrfs_iget(root->fs_info->sb, objectid, root);
+ inode = btrfs_iget_logging(objectid, root);
if (IS_ERR(inode))
inode = NULL;
return inode;
@@ -4860,18 +4879,23 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
path->slots[0]++;
continue;
}
- if (!dropped_extents) {
- /*
- * Avoid logging extent items logged in past fsync calls
- * and leading to duplicate keys in the log tree.
- */
+ /*
+ * Avoid overlapping items in the log tree. The first time we
+ * get here, get rid of everything from a past fsync. After
+ * that, if the current extent starts before the end of the last
+ * extent we copied, truncate the last one. This can happen if
+ * an ordered extent completion modifies the subvolume tree
+ * while btrfs_next_leaf() has the tree unlocked.
+ */
+ if (!dropped_extents || key.offset < truncate_offset) {
ret = truncate_inode_items(trans, root->log_root, inode,
- truncate_offset,
+ min(key.offset, truncate_offset),
BTRFS_EXTENT_DATA_KEY);
if (ret)
goto out;
dropped_extents = true;
}
+ truncate_offset = btrfs_file_extent_end(path);
if (ins_nr == 0)
start_slot = slot;
ins_nr++;
@@ -5433,7 +5457,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = start_inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
LIST_HEAD(dir_list);
struct btrfs_dir_list *dir_elem;
@@ -5494,7 +5517,7 @@ again:
continue;
btrfs_release_path(path);
- di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
+ di_inode = btrfs_iget_logging(di_key.objectid, root);
if (IS_ERR(di_inode)) {
ret = PTR_ERR(di_inode);
goto out;
@@ -5554,7 +5577,7 @@ again:
btrfs_add_delayed_iput(curr_inode);
curr_inode = NULL;
- vfs_inode = btrfs_iget(fs_info->sb, ino, root);
+ vfs_inode = btrfs_iget_logging(ino, root);
if (IS_ERR(vfs_inode)) {
ret = PTR_ERR(vfs_inode);
break;
@@ -5649,7 +5672,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
return BTRFS_LOG_FORCE_COMMIT;
- inode = btrfs_iget(root->fs_info->sb, ino, root);
+ inode = btrfs_iget_logging(ino, root);
/*
* If the other inode that had a conflicting dir entry was deleted in
* the current transaction then we either:
@@ -5750,7 +5773,6 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
/*
@@ -5781,7 +5803,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
list_del(&curr->list);
kfree(curr);
- inode = btrfs_iget(fs_info->sb, ino, root);
+ inode = btrfs_iget_logging(ino, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
@@ -5792,7 +5814,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
if (ret != -ENOENT)
break;
- inode = btrfs_iget(fs_info->sb, parent, root);
+ inode = btrfs_iget_logging(parent, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
break;
@@ -6314,7 +6336,6 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
const bool orig_log_new_dentries = ctx->log_new_dentries;
- struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_item *item;
int ret = 0;
@@ -6340,7 +6361,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
if (key.type == BTRFS_ROOT_ITEM_KEY)
continue;
- di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
+ di_inode = btrfs_iget_logging(key.objectid, inode->root);
if (IS_ERR(di_inode)) {
ret = PTR_ERR(di_inode);
break;
@@ -6724,7 +6745,6 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
@@ -6789,8 +6809,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
cur_offset = item_size;
}
- dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
- root);
+ dir_inode = btrfs_iget_logging(inode_key.objectid, root);
/*
* If the parent inode was deleted, return an error to
* fallback to a transaction commit. This is to prevent
@@ -6852,7 +6871,6 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
while (true) {
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
int slot;
struct btrfs_key search_key;
@@ -6867,7 +6885,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
search_key.objectid = found_key.offset;
search_key.type = BTRFS_INODE_ITEM_KEY;
search_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, ino, root);
+ inode = btrfs_iget_logging(ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 6465e2574230..06cdf1a8a16f 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -133,7 +133,7 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file)
return 0;
}
-static void cachefiles_flush_reqs(struct cachefiles_cache *cache)
+void cachefiles_flush_reqs(struct cachefiles_cache *cache)
{
struct xarray *xa = &cache->reqs;
struct cachefiles_req *req;
@@ -159,6 +159,7 @@ static void cachefiles_flush_reqs(struct cachefiles_cache *cache)
xa_for_each(xa, index, req) {
req->error = -EIO;
complete(&req->done);
+ __xa_erase(xa, index);
}
xa_unlock(xa);
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index d33169f0018b..6845a90cdfcc 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -55,6 +55,7 @@ struct cachefiles_ondemand_info {
int ondemand_id;
enum cachefiles_object_state state;
struct cachefiles_object *object;
+ spinlock_t lock;
};
/*
@@ -138,6 +139,7 @@ static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache)
struct cachefiles_req {
struct cachefiles_object *object;
struct completion done;
+ refcount_t ref;
int error;
struct cachefiles_msg msg;
};
@@ -186,6 +188,7 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache,
* daemon.c
*/
extern const struct file_operations cachefiles_daemon_fops;
+extern void cachefiles_flush_reqs(struct cachefiles_cache *cache);
extern void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache);
extern void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache);
@@ -424,6 +427,8 @@ do { \
pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \
fscache_io_error((___cache)->cache); \
set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
+ if (cachefiles_in_ondemand_mode(___cache)) \
+ cachefiles_flush_reqs(___cache); \
} while (0)
#define cachefiles_io_error_obj(object, FMT, ...) \
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index 4ba42f1fa3b4..bce005f2b456 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -1,22 +1,42 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-#include <linux/fdtable.h>
#include <linux/anon_inodes.h>
#include <linux/uio.h>
#include "internal.h"
+struct ondemand_anon_file {
+ struct file *file;
+ int fd;
+};
+
+static inline void cachefiles_req_put(struct cachefiles_req *req)
+{
+ if (refcount_dec_and_test(&req->ref))
+ kfree(req);
+}
+
static int cachefiles_ondemand_fd_release(struct inode *inode,
struct file *file)
{
struct cachefiles_object *object = file->private_data;
- struct cachefiles_cache *cache = object->volume->cache;
- struct cachefiles_ondemand_info *info = object->ondemand;
- int object_id = info->ondemand_id;
+ struct cachefiles_cache *cache;
+ struct cachefiles_ondemand_info *info;
+ int object_id;
struct cachefiles_req *req;
- XA_STATE(xas, &cache->reqs, 0);
+ XA_STATE(xas, NULL, 0);
+
+ if (!object)
+ return 0;
+
+ info = object->ondemand;
+ cache = object->volume->cache;
+ xas.xa = &cache->reqs;
xa_lock(&cache->reqs);
+ spin_lock(&info->lock);
+ object_id = info->ondemand_id;
info->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED;
cachefiles_ondemand_set_object_close(object);
+ spin_unlock(&info->lock);
/* Only flush CACHEFILES_REQ_NEW marked req to avoid race with daemon_read */
xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) {
@@ -76,12 +96,12 @@ static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos,
}
static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl,
- unsigned long arg)
+ unsigned long id)
{
struct cachefiles_object *object = filp->private_data;
struct cachefiles_cache *cache = object->volume->cache;
struct cachefiles_req *req;
- unsigned long id;
+ XA_STATE(xas, &cache->reqs, id);
if (ioctl != CACHEFILES_IOC_READ_COMPLETE)
return -EINVAL;
@@ -89,10 +109,15 @@ static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl,
if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
return -EOPNOTSUPP;
- id = arg;
- req = xa_erase(&cache->reqs, id);
- if (!req)
+ xa_lock(&cache->reqs);
+ req = xas_load(&xas);
+ if (!req || req->msg.opcode != CACHEFILES_OP_READ ||
+ req->object != object) {
+ xa_unlock(&cache->reqs);
return -EINVAL;
+ }
+ xas_store(&xas, NULL);
+ xa_unlock(&cache->reqs);
trace_cachefiles_ondemand_cread(object, id);
complete(&req->done);
@@ -116,10 +141,12 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
{
struct cachefiles_req *req;
struct fscache_cookie *cookie;
+ struct cachefiles_ondemand_info *info;
char *pid, *psize;
unsigned long id;
long size;
int ret;
+ XA_STATE(xas, &cache->reqs, 0);
if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
return -EOPNOTSUPP;
@@ -143,10 +170,18 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
if (ret)
return ret;
- req = xa_erase(&cache->reqs, id);
- if (!req)
+ xa_lock(&cache->reqs);
+ xas.xa_index = id;
+ req = xas_load(&xas);
+ if (!req || req->msg.opcode != CACHEFILES_OP_OPEN ||
+ !req->object->ondemand->ondemand_id) {
+ xa_unlock(&cache->reqs);
return -EINVAL;
+ }
+ xas_store(&xas, NULL);
+ xa_unlock(&cache->reqs);
+ info = req->object->ondemand;
/* fail OPEN request if copen format is invalid */
ret = kstrtol(psize, 0, &size);
if (ret) {
@@ -166,6 +201,32 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
goto out;
}
+ spin_lock(&info->lock);
+ /*
+ * The anonymous fd was closed before copen ? Fail the request.
+ *
+ * t1 | t2
+ * ---------------------------------------------------------
+ * cachefiles_ondemand_copen
+ * req = xa_erase(&cache->reqs, id)
+ * // Anon fd is maliciously closed.
+ * cachefiles_ondemand_fd_release
+ * xa_lock(&cache->reqs)
+ * cachefiles_ondemand_set_object_close(object)
+ * xa_unlock(&cache->reqs)
+ * cachefiles_ondemand_set_object_open
+ * // No one will ever close it again.
+ * cachefiles_ondemand_daemon_read
+ * cachefiles_ondemand_select_req
+ *
+ * Get a read req but its fd is already closed. The daemon can't
+ * issue a cread ioctl with an closed fd, then hung.
+ */
+ if (info->ondemand_id == CACHEFILES_ONDEMAND_ID_CLOSED) {
+ spin_unlock(&info->lock);
+ req->error = -EBADFD;
+ goto out;
+ }
cookie = req->object->cookie;
cookie->object_size = size;
if (size)
@@ -175,9 +236,15 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
trace_cachefiles_ondemand_copen(req->object, id, size);
cachefiles_ondemand_set_object_open(req->object);
+ spin_unlock(&info->lock);
wake_up_all(&cache->daemon_pollwq);
out:
+ spin_lock(&info->lock);
+ /* Need to set object close to avoid reopen status continuing */
+ if (info->ondemand_id == CACHEFILES_ONDEMAND_ID_CLOSED)
+ cachefiles_ondemand_set_object_close(req->object);
+ spin_unlock(&info->lock);
complete(&req->done);
return ret;
}
@@ -205,14 +272,14 @@ int cachefiles_ondemand_restore(struct cachefiles_cache *cache, char *args)
return 0;
}
-static int cachefiles_ondemand_get_fd(struct cachefiles_req *req)
+static int cachefiles_ondemand_get_fd(struct cachefiles_req *req,
+ struct ondemand_anon_file *anon_file)
{
struct cachefiles_object *object;
struct cachefiles_cache *cache;
struct cachefiles_open *load;
- struct file *file;
u32 object_id;
- int ret, fd;
+ int ret;
object = cachefiles_grab_object(req->object,
cachefiles_obj_get_ondemand_fd);
@@ -224,35 +291,53 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req)
if (ret < 0)
goto err;
- fd = get_unused_fd_flags(O_WRONLY);
- if (fd < 0) {
- ret = fd;
+ anon_file->fd = get_unused_fd_flags(O_WRONLY);
+ if (anon_file->fd < 0) {
+ ret = anon_file->fd;
goto err_free_id;
}
- file = anon_inode_getfile("[cachefiles]", &cachefiles_ondemand_fd_fops,
- object, O_WRONLY);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
+ anon_file->file = anon_inode_getfile("[cachefiles]",
+ &cachefiles_ondemand_fd_fops, object, O_WRONLY);
+ if (IS_ERR(anon_file->file)) {
+ ret = PTR_ERR(anon_file->file);
goto err_put_fd;
}
- file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
- fd_install(fd, file);
+ spin_lock(&object->ondemand->lock);
+ if (object->ondemand->ondemand_id > 0) {
+ spin_unlock(&object->ondemand->lock);
+ /* Pair with check in cachefiles_ondemand_fd_release(). */
+ anon_file->file->private_data = NULL;
+ ret = -EEXIST;
+ goto err_put_file;
+ }
+
+ anon_file->file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
load = (void *)req->msg.data;
- load->fd = fd;
+ load->fd = anon_file->fd;
object->ondemand->ondemand_id = object_id;
+ spin_unlock(&object->ondemand->lock);
cachefiles_get_unbind_pincount(cache);
trace_cachefiles_ondemand_open(object, &req->msg, load);
return 0;
+err_put_file:
+ fput(anon_file->file);
+ anon_file->file = NULL;
err_put_fd:
- put_unused_fd(fd);
+ put_unused_fd(anon_file->fd);
+ anon_file->fd = ret;
err_free_id:
xa_erase(&cache->ondemand_ids, object_id);
err:
+ spin_lock(&object->ondemand->lock);
+ /* Avoid marking an opened object as closed. */
+ if (object->ondemand->ondemand_id <= 0)
+ cachefiles_ondemand_set_object_close(object);
+ spin_unlock(&object->ondemand->lock);
cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd);
return ret;
}
@@ -294,14 +379,28 @@ static struct cachefiles_req *cachefiles_ondemand_select_req(struct xa_state *xa
return NULL;
}
+static inline bool cachefiles_ondemand_finish_req(struct cachefiles_req *req,
+ struct xa_state *xas, int err)
+{
+ if (unlikely(!xas || !req))
+ return false;
+
+ if (xa_cmpxchg(xas->xa, xas->xa_index, req, NULL, 0) != req)
+ return false;
+
+ req->error = err;
+ complete(&req->done);
+ return true;
+}
+
ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
char __user *_buffer, size_t buflen)
{
struct cachefiles_req *req;
struct cachefiles_msg *msg;
- unsigned long id = 0;
size_t n;
int ret = 0;
+ struct ondemand_anon_file anon_file;
XA_STATE(xas, &cache->reqs, cache->req_id_next);
xa_lock(&cache->reqs);
@@ -330,42 +429,37 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
xas_clear_mark(&xas, CACHEFILES_REQ_NEW);
cache->req_id_next = xas.xa_index + 1;
+ refcount_inc(&req->ref);
+ cachefiles_grab_object(req->object, cachefiles_obj_get_read_req);
xa_unlock(&cache->reqs);
- id = xas.xa_index;
-
if (msg->opcode == CACHEFILES_OP_OPEN) {
- ret = cachefiles_ondemand_get_fd(req);
- if (ret) {
- cachefiles_ondemand_set_object_close(req->object);
- goto error;
- }
+ ret = cachefiles_ondemand_get_fd(req, &anon_file);
+ if (ret)
+ goto out;
}
- msg->msg_id = id;
+ msg->msg_id = xas.xa_index;
msg->object_id = req->object->ondemand->ondemand_id;
- if (copy_to_user(_buffer, msg, n) != 0) {
+ if (copy_to_user(_buffer, msg, n) != 0)
ret = -EFAULT;
- goto err_put_fd;
- }
- /* CLOSE request has no reply */
- if (msg->opcode == CACHEFILES_OP_CLOSE) {
- xa_erase(&cache->reqs, id);
- complete(&req->done);
+ if (msg->opcode == CACHEFILES_OP_OPEN) {
+ if (ret < 0) {
+ fput(anon_file.file);
+ put_unused_fd(anon_file.fd);
+ goto out;
+ }
+ fd_install(anon_file.fd, anon_file.file);
}
-
- return n;
-
-err_put_fd:
- if (msg->opcode == CACHEFILES_OP_OPEN)
- close_fd(((struct cachefiles_open *)msg->data)->fd);
-error:
- xa_erase(&cache->reqs, id);
- req->error = ret;
- complete(&req->done);
- return ret;
+out:
+ cachefiles_put_object(req->object, cachefiles_obj_put_read_req);
+ /* Remove error request and CLOSE request has no reply */
+ if (ret || msg->opcode == CACHEFILES_OP_CLOSE)
+ cachefiles_ondemand_finish_req(req, &xas, ret);
+ cachefiles_req_put(req);
+ return ret ? ret : n;
}
typedef int (*init_req_fn)(struct cachefiles_req *req, void *private);
@@ -395,6 +489,7 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
goto out;
}
+ refcount_set(&req->ref, 1);
req->object = object;
init_completion(&req->done);
req->msg.opcode = opcode;
@@ -454,9 +549,19 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
goto out;
wake_up_all(&cache->daemon_pollwq);
- wait_for_completion(&req->done);
- ret = req->error;
- kfree(req);
+wait:
+ ret = wait_for_completion_killable(&req->done);
+ if (!ret) {
+ ret = req->error;
+ } else {
+ ret = -EINTR;
+ if (!cachefiles_ondemand_finish_req(req, &xas, ret)) {
+ /* Someone will complete it soon. */
+ cpu_relax();
+ goto wait;
+ }
+ }
+ cachefiles_req_put(req);
return ret;
out:
/* Reset the object to close state in error handling path.
@@ -578,6 +683,7 @@ int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object,
return -ENOMEM;
object->ondemand->object = object;
+ spin_lock_init(&object->ondemand->lock);
INIT_WORK(&object->ondemand->ondemand_work, ondemand_object_worker);
return 0;
}
diff --git a/fs/dcache.c b/fs/dcache.c
index 407095188f83..d58dc9e58f3b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3029,28 +3029,25 @@ EXPORT_SYMBOL(d_splice_alias);
bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
- bool result;
+ bool subdir;
unsigned seq;
if (new_dentry == old_dentry)
return true;
- do {
- /* for restarting inner loop in case of seq retry */
- seq = read_seqbegin(&rename_lock);
- /*
- * Need rcu_readlock to protect against the d_parent trashing
- * due to d_move
- */
- rcu_read_lock();
- if (d_ancestor(old_dentry, new_dentry))
- result = true;
- else
- result = false;
- rcu_read_unlock();
- } while (read_seqretry(&rename_lock, seq));
-
- return result;
+ /* Access d_parent under rcu as d_move() may change it. */
+ rcu_read_lock();
+ seq = read_seqbegin(&rename_lock);
+ subdir = d_ancestor(old_dentry, new_dentry);
+ /* Try lockless once... */
+ if (read_seqretry(&rename_lock, seq)) {
+ /* ...else acquire lock for progress even on deep chains. */
+ read_seqlock_excl(&rename_lock);
+ subdir = d_ancestor(old_dentry, new_dentry);
+ read_sequnlock_excl(&rename_lock);
+ }
+ rcu_read_unlock();
+ return subdir;
}
EXPORT_SYMBOL(is_subdir);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index dc51df0b118d..8fd928899a59 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -107,8 +107,16 @@ static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param
int opt;
opt = fs_parse(fc, debugfs_param_specs, param, &result);
- if (opt < 0)
+ if (opt < 0) {
+ /*
+ * We might like to report bad mount options here; but
+ * traditionally debugfs has ignored all mount options
+ */
+ if (opt == -ENOPARAM)
+ return 0;
+
return opt;
+ }
switch (opt) {
case Opt_uid:
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index c93bd24d2771..1b91d9513013 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -343,7 +343,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->build_time = le64_to_cpu(dsb->build_time);
sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec);
- memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid));
+ super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));
ret = strscpy(sbi->volume_name, dsb->volume_name,
sizeof(dsb->volume_name));
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 9b248ee5fef2..74d3d7bffcf3 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -711,6 +711,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
err = z_erofs_do_map_blocks(inode, map, flags);
out:
+ if (err)
+ map->m_llen = 0;
trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
return err;
}
diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
index 036024bce9f7..b80f612867c2 100644
--- a/fs/erofs/zutil.c
+++ b/fs/erofs/zutil.c
@@ -148,7 +148,7 @@ int __init z_erofs_gbuf_init(void)
void z_erofs_gbuf_exit(void)
{
- int i;
+ int i, j;
for (i = 0; i < z_erofs_gbuf_count + (!!z_erofs_rsvbuf); ++i) {
struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i];
@@ -161,9 +161,9 @@ void z_erofs_gbuf_exit(void)
if (!gbuf->pages)
continue;
- for (i = 0; i < gbuf->nrpages; ++i)
- if (gbuf->pages[i])
- put_page(gbuf->pages[i]);
+ for (j = 0; j < gbuf->nrpages; ++j)
+ if (gbuf->pages[j])
+ put_page(gbuf->pages[j]);
kfree(gbuf->pages);
gbuf->pages = NULL;
}
diff --git a/fs/file.c b/fs/file.c
index 8076aef9c210..a3b72aa64f11 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -486,12 +486,12 @@ struct files_struct init_files = {
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
- unsigned int maxfd = fdt->max_fds;
+ unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
unsigned int maxbit = maxfd / BITS_PER_LONG;
unsigned int bitbit = start / BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
- if (bitbit > maxfd)
+ if (bitbit >= maxfd)
return maxfd;
if (bitbit > start)
start = bitbit;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index c5802a459334..d46558990279 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -241,6 +241,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
unsigned block_size = (1 << block_bits);
size_t poff = offset_in_folio(folio, *pos);
size_t plen = min_t(loff_t, folio_size(folio) - poff, length);
+ size_t orig_plen = plen;
unsigned first = poff >> block_bits;
unsigned last = (poff + plen - 1) >> block_bits;
@@ -277,7 +278,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
* handle both halves separately so that we properly zero data in the
* page cache for blocks that are entirely outside of i_size.
*/
- if (orig_pos <= isize && orig_pos + length > isize) {
+ if (orig_pos <= isize && orig_pos + orig_plen > isize) {
unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;
if (first <= end && last > end)
@@ -877,22 +878,37 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
size_t copied, struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t old_size = iter->inode->i_size;
+ size_t written;
if (srcmap->type == IOMAP_INLINE) {
iomap_write_end_inline(iter, folio, pos, copied);
- return true;
+ written = copied;
+ } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
+ written = block_write_end(NULL, iter->inode->i_mapping, pos,
+ len, copied, &folio->page, NULL);
+ WARN_ON_ONCE(written != copied && written != 0);
+ } else {
+ written = __iomap_write_end(iter->inode, pos, len, copied,
+ folio) ? copied : 0;
}
- if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
- size_t bh_written;
-
- bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
- len, copied, &folio->page, NULL);
- WARN_ON_ONCE(bh_written != copied && bh_written != 0);
- return bh_written == copied;
+ /*
+ * Update the in-memory inode size after copying the data into the page
+ * cache. It's up to the file system to write the updated size to disk,
+ * preferably after I/O completion so that no stale data is exposed.
+ * Only once that's done can we unlock and release the folio.
+ */
+ if (pos + written > old_size) {
+ i_size_write(iter->inode, pos + written);
+ iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
}
+ __iomap_put_folio(iter, pos, written, folio);
- return __iomap_write_end(iter->inode, pos, len, copied, folio);
+ if (old_size < pos)
+ pagecache_isize_extended(iter->inode, old_size, pos);
+
+ return written == copied;
}
static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
@@ -907,7 +923,6 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
do {
struct folio *folio;
- loff_t old_size;
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
@@ -959,23 +974,6 @@ retry:
written = iomap_write_end(iter, pos, bytes, copied, folio) ?
copied : 0;
- /*
- * Update the in-memory inode size after copying the data into
- * the page cache. It's up to the file system to write the
- * updated size to disk, preferably after I/O completion so that
- * no stale data is exposed. Only once that's done can we
- * unlock and release the folio.
- */
- old_size = iter->inode->i_size;
- if (pos + written > old_size) {
- i_size_write(iter->inode, pos + written);
- iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
- }
- __iomap_put_folio(iter, pos, written, folio);
-
- if (old_size < pos)
- pagecache_isize_extended(iter->inode, old_size, pos);
-
cond_resched();
if (unlikely(written == 0)) {
/*
@@ -1346,7 +1344,6 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
bytes = folio_size(folio) - offset;
ret = iomap_write_end(iter, pos, bytes, bytes, folio);
- __iomap_put_folio(iter, pos, bytes, folio);
if (WARN_ON_ONCE(!ret))
return -EIO;
@@ -1412,7 +1409,6 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
folio_mark_accessed(folio);
ret = iomap_write_end(iter, pos, bytes, bytes, folio);
- __iomap_put_folio(iter, pos, bytes, folio);
if (WARN_ON_ONCE(!ret))
return -EIO;
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 0fb7afac298e..9987055293b3 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -557,9 +557,11 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
size_check:
if (EALIST_SIZE(ea_buf->xattr) != ea_size) {
+ int size = min_t(int, EALIST_SIZE(ea_buf->xattr), ea_size);
+
printk(KERN_ERR "ea_get: invalid extended attribute\n");
print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1,
- ea_buf->xattr, ea_size, 1);
+ ea_buf->xattr, size, 1);
ea_release(inode, ea_buf);
rc = -EIO;
goto clean_up;
diff --git a/fs/locks.c b/fs/locks.c
index 90c8746874de..c360d1992d21 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2448,8 +2448,9 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
error = do_lock_file_wait(filp, cmd, file_lock);
/*
- * Attempt to detect a close/fcntl race and recover by releasing the
- * lock that was just acquired. There is no need to do that when we're
+ * Detect close/fcntl races and recover by zapping all POSIX locks
+ * associated with this file and our files_struct, just like on
+ * filp_flush(). There is no need to do that when we're
* unlocking though, or for OFD locks.
*/
if (!error && file_lock->c.flc_type != F_UNLCK &&
@@ -2464,9 +2465,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
f = files_lookup_fd_locked(files, fd);
spin_unlock(&files->file_lock);
if (f != filp) {
- file_lock->c.flc_type = F_UNLCK;
- error = do_lock_file_wait(filp, cmd, file_lock);
- WARN_ON_ONCE(error);
+ locks_remove_posix(filp, files);
error = -EBADF;
}
}
diff --git a/fs/namei.c b/fs/namei.c
index 37fb0a8aa09a..1e05a0f3f04d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3572,8 +3572,12 @@ static const char *open_last_lookups(struct nameidata *nd,
else
inode_lock_shared(dir->d_inode);
dentry = lookup_open(nd, file, op, got_write);
- if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
- fsnotify_create(dir->d_inode, dentry);
+ if (!IS_ERR(dentry)) {
+ if (file->f_mode & FMODE_CREATED)
+ fsnotify_create(dir->d_inode, dentry);
+ if (file->f_mode & FMODE_OPENED)
+ fsnotify_open(file);
+ }
if (open_flag & O_CREAT)
inode_unlock(dir->d_inode);
else
@@ -3700,6 +3704,8 @@ int vfs_tmpfile(struct mnt_idmap *idmap,
mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
error = dir->i_op->tmpfile(idmap, dir, file, mode);
dput(child);
+ if (file->f_mode & FMODE_OPENED)
+ fsnotify_open(file);
if (error)
return error;
/* Don't check for other permissions, the inode was just created */
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 07bc1fd43530..d583af7a2209 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -523,6 +523,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
struct netfs_group *group;
struct folio *folio = page_folio(vmf->page);
struct file *file = vmf->vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = file_inode(file);
struct netfs_inode *ictx = netfs_inode(inode);
vm_fault_t ret = VM_FAULT_RETRY;
@@ -534,6 +535,11 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
if (folio_lock_killable(folio) < 0)
goto out;
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
if (folio_wait_writeback_killable(folio)) {
ret = VM_FAULT_LOCKED;
@@ -549,9 +555,9 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
group = netfs_folio_group(folio);
if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) {
folio_unlock(folio);
- err = filemap_fdatawait_range(inode->i_mapping,
- folio_pos(folio),
- folio_pos(folio) + folio_size(folio));
+ err = filemap_fdatawrite_range(mapping,
+ folio_pos(folio),
+ folio_pos(folio) + folio_size(folio));
switch (err) {
case 0:
ret = VM_FAULT_RETRY;
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index e14cd53ac9fd..88f2adfab75e 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -92,8 +92,9 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
if (async)
wreq->iocb = iocb;
+ wreq->len = iov_iter_count(&wreq->io_iter);
wreq->cleanup = netfs_cleanup_dio_write;
- ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), iov_iter_count(&wreq->io_iter));
+ ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
if (ret < 0) {
_debug("begin = %zd", ret);
goto out;
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 95e281a8af78..acd9ca14e264 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -63,15 +63,6 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
/*
* misc.c
*/
-#define NETFS_FLAG_PUT_MARK BIT(0)
-#define NETFS_FLAG_PAGECACHE_MARK BIT(1)
-int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
- struct folio *folio, unsigned int flags,
- gfp_t gfp_mask);
-int netfs_add_folios_to_buffer(struct xarray *buffer,
- struct address_space *mapping,
- pgoff_t index, pgoff_t to, gfp_t gfp_mask);
-void netfs_clear_buffer(struct xarray *buffer);
/*
* objects.c
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index bc1fc54fb724..83e644bd518f 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -8,87 +8,6 @@
#include <linux/swap.h>
#include "internal.h"
-/*
- * Attach a folio to the buffer and maybe set marks on it to say that we need
- * to put the folio later and twiddle the pagecache flags.
- */
-int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
- struct folio *folio, unsigned int flags,
- gfp_t gfp_mask)
-{
- XA_STATE_ORDER(xas, xa, index, folio_order(folio));
-
-retry:
- xas_lock(&xas);
- for (;;) {
- xas_store(&xas, folio);
- if (!xas_error(&xas))
- break;
- xas_unlock(&xas);
- if (!xas_nomem(&xas, gfp_mask))
- return xas_error(&xas);
- goto retry;
- }
-
- if (flags & NETFS_FLAG_PUT_MARK)
- xas_set_mark(&xas, NETFS_BUF_PUT_MARK);
- if (flags & NETFS_FLAG_PAGECACHE_MARK)
- xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK);
- xas_unlock(&xas);
- return xas_error(&xas);
-}
-
-/*
- * Create the specified range of folios in the buffer attached to the read
- * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that
- * these need freeing later.
- */
-int netfs_add_folios_to_buffer(struct xarray *buffer,
- struct address_space *mapping,
- pgoff_t index, pgoff_t to, gfp_t gfp_mask)
-{
- struct folio *folio;
- int ret;
-
- if (to + 1 == index) /* Page range is inclusive */
- return 0;
-
- do {
- /* TODO: Figure out what order folio can be allocated here */
- folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0);
- if (!folio)
- return -ENOMEM;
- folio->index = index;
- ret = netfs_xa_store_and_mark(buffer, index, folio,
- NETFS_FLAG_PUT_MARK, gfp_mask);
- if (ret < 0) {
- folio_put(folio);
- return ret;
- }
-
- index += folio_nr_pages(folio);
- } while (index <= to && index != 0);
-
- return 0;
-}
-
-/*
- * Clear an xarray buffer, putting a ref on the folios that have
- * NETFS_BUF_PUT_MARK set.
- */
-void netfs_clear_buffer(struct xarray *buffer)
-{
- struct folio *folio;
- XA_STATE(xas, buffer, 0);
-
- rcu_read_lock();
- xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) {
- folio_put(folio);
- }
- rcu_read_unlock();
- xa_destroy(buffer);
-}
-
/**
* netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback
* @mapping: The mapping the folio belongs to.
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 3aa86e268f40..ec6cf8707fb0 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -483,7 +483,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
if (!debug)
kdebug("R=%x: No submit", wreq->debug_id);
- if (flen < fsize)
+ if (foff + flen < fsize)
for (int s = 0; s < NR_IO_STREAMS; s++)
netfs_issue_write(wreq, &wreq->io_streams[s]);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 342930996226..07a7be27182e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1627,7 +1627,16 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
switch (error) {
case 1:
break;
- case 0:
+ case -ETIMEDOUT:
+ if (inode && (IS_ROOT(dentry) ||
+ NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL))
+ error = 1;
+ break;
+ case -ESTALE:
+ case -ENOENT:
+ error = 0;
+ fallthrough;
+ default:
/*
* We can't d_drop the root of a disconnected tree:
* its d_hash is on the s_anon list and d_drop() would hide
@@ -1682,18 +1691,8 @@ static int nfs_lookup_revalidate_dentry(struct inode *dir,
dir_verifier = nfs_save_change_attribute(dir);
ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
- if (ret < 0) {
- switch (ret) {
- case -ESTALE:
- case -ENOENT:
- ret = 0;
- break;
- case -ETIMEDOUT:
- if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)
- ret = 1;
- }
+ if (ret < 0)
goto out;
- }
/* Request help from readdirplus */
nfs_lookup_advise_force_readdirplus(dir, flags);
@@ -1737,7 +1736,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct inode *inode;
- int error;
+ int error = 0;
nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
inode = d_inode(dentry);
@@ -1782,7 +1781,7 @@ out_valid:
out_bad:
if (flags & LOOKUP_RCU)
return -ECHILD;
- return nfs_lookup_revalidate_done(dir, dentry, inode, 0);
+ return nfs_lookup_revalidate_done(dir, dentry, inode, error);
}
static int
@@ -1804,9 +1803,10 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
if (parent != READ_ONCE(dentry->d_parent))
return -ECHILD;
} else {
- /* Wait for unlink to complete */
+ /* Wait for unlink to complete - see unblock_revalidate() */
wait_var_event(&dentry->d_fsdata,
- dentry->d_fsdata != NFS_FSDATA_BLOCKED);
+ smp_load_acquire(&dentry->d_fsdata)
+ != NFS_FSDATA_BLOCKED);
parent = dget_parent(dentry);
ret = reval(d_inode(parent), dentry, flags);
dput(parent);
@@ -1819,6 +1819,29 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate);
}
+static void block_revalidate(struct dentry *dentry)
+{
+ /* old devname - just in case */
+ kfree(dentry->d_fsdata);
+
+ /* Any new reference that could lead to an open
+ * will take ->d_lock in lookup_open() -> d_lookup().
+ * Holding this lock ensures we cannot race with
+ * __nfs_lookup_revalidate() and removes and need
+ * for further barriers.
+ */
+ lockdep_assert_held(&dentry->d_lock);
+
+ dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+}
+
+static void unblock_revalidate(struct dentry *dentry)
+{
+ /* store_release ensures wait_var_event() sees the update */
+ smp_store_release(&dentry->d_fsdata, NULL);
+ wake_up_var(&dentry->d_fsdata);
+}
+
/*
* A weaker form of d_revalidate for revalidating just the d_inode(dentry)
* when we don't really care about the dentry name. This is called when a
@@ -2255,6 +2278,9 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
*/
int error = 0;
+ if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
+ return -ENAMETOOLONG;
+
if (open_flags & O_CREAT) {
file->f_mode |= FMODE_CREATED;
error = nfs_do_create(dir, dentry, mode, open_flags);
@@ -2549,15 +2575,12 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
spin_unlock(&dentry->d_lock);
goto out;
}
- /* old devname */
- kfree(dentry->d_fsdata);
- dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+ block_revalidate(dentry);
spin_unlock(&dentry->d_lock);
error = nfs_safe_remove(dentry);
nfs_dentry_remove_handle_error(dir, dentry, error);
- dentry->d_fsdata = NULL;
- wake_up_var(&dentry->d_fsdata);
+ unblock_revalidate(dentry);
out:
trace_nfs_unlink_exit(dir, dentry, error);
return error;
@@ -2664,8 +2687,7 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
{
struct dentry *new_dentry = data->new_dentry;
- new_dentry->d_fsdata = NULL;
- wake_up_var(&new_dentry->d_fsdata);
+ unblock_revalidate(new_dentry);
}
/*
@@ -2727,11 +2749,6 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED))
goto out;
- if (new_dentry->d_fsdata) {
- /* old devname */
- kfree(new_dentry->d_fsdata);
- new_dentry->d_fsdata = NULL;
- }
spin_lock(&new_dentry->d_lock);
if (d_count(new_dentry) > 2) {
@@ -2753,7 +2770,7 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
new_dentry = dentry;
new_inode = NULL;
} else {
- new_dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+ block_revalidate(new_dentry);
must_unblock = true;
spin_unlock(&new_dentry->d_lock);
}
@@ -2765,6 +2782,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
must_unblock ? nfs_unblock_rename : NULL);
if (IS_ERR(task)) {
+ if (must_unblock)
+ unblock_revalidate(new_dentry);
error = PTR_ERR(task);
goto out;
}
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index bb2f583eb28b..90079ca134dd 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -141,8 +141,6 @@ int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t ret;
- VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
-
if (iov_iter_rw(iter) == READ)
ret = nfs_file_direct_read(iocb, iter, true);
else
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c93c12063b3a..a691fa10b3e9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4023,6 +4023,23 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location,
}
}
+static bool _is_same_nfs4_pathname(struct nfs4_pathname *path1,
+ struct nfs4_pathname *path2)
+{
+ int i;
+
+ if (path1->ncomponents != path2->ncomponents)
+ return false;
+ for (i = 0; i < path1->ncomponents; i++) {
+ if (path1->components[i].len != path2->components[i].len)
+ return false;
+ if (memcmp(path1->components[i].data, path2->components[i].data,
+ path1->components[i].len))
+ return false;
+ }
+ return true;
+}
+
static int _nfs4_discover_trunking(struct nfs_server *server,
struct nfs_fh *fhandle)
{
@@ -4056,9 +4073,13 @@ static int _nfs4_discover_trunking(struct nfs_server *server,
if (status)
goto out_free_3;
- for (i = 0; i < locations->nlocations; i++)
+ for (i = 0; i < locations->nlocations; i++) {
+ if (!_is_same_nfs4_pathname(&locations->fs_path,
+ &locations->locations[i].rootpath))
+ continue;
test_fs_location_for_trunking(&locations->locations[i], clp,
server);
+ }
out_free_3:
kfree(locations->fattr);
out_free_2:
@@ -6268,6 +6289,7 @@ nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
if (status == 0)
nfs_setsecurity(inode, fattr);
+ nfs_free_fattr(fattr);
return status;
}
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 6efb5068c116..040b6b79c75e 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -1545,6 +1545,11 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
continue;
} else if (index == prev->wb_index + 1)
continue;
+ /*
+ * We will submit more requests after these. Indicate
+ * this to the underlying layers.
+ */
+ desc->pg_moreio = 1;
nfs_pageio_complete(desc);
break;
}
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 0e27a2e4e68b..13818129d268 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -41,7 +41,7 @@ static int nfs_symlink_filler(struct file *file, struct folio *folio)
error:
folio_set_error(folio);
folio_unlock(folio);
- return -EIO;
+ return error;
}
static const char *nfs_get_link(struct dentry *dentry,
diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c
index 62d2586d9902..529a75ecf22e 100644
--- a/fs/nfsd/netlink.c
+++ b/fs/nfsd/netlink.c
@@ -44,9 +44,7 @@ static const struct nla_policy nfsd_listener_set_nl_policy[NFSD_A_SERVER_SOCK_AD
static const struct genl_split_ops nfsd_nl_ops[] = {
{
.cmd = NFSD_CMD_RPC_STATUS_GET,
- .start = nfsd_nl_rpc_status_get_start,
.dumpit = nfsd_nl_rpc_status_get_dumpit,
- .done = nfsd_nl_rpc_status_get_done,
.flags = GENL_CMD_CAP_DUMP,
},
{
diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h
index e3724637d64d..2e132ef328f8 100644
--- a/fs/nfsd/netlink.h
+++ b/fs/nfsd/netlink.h
@@ -15,9 +15,6 @@
extern const struct nla_policy nfsd_sock_nl_policy[NFSD_A_SOCK_TRANSPORT_NAME + 1];
extern const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1];
-int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb);
-int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb);
-
int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
struct netlink_callback *cb);
int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 202140df8f82..c848ebe5d08f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1460,28 +1460,6 @@ static int create_proc_exports_entry(void)
unsigned int nfsd_net_id;
-/**
- * nfsd_nl_rpc_status_get_start - Prepare rpc_status_get dumpit
- * @cb: netlink metadata and command arguments
- *
- * Return values:
- * %0: The rpc_status_get command may proceed
- * %-ENODEV: There is no NFSD running in this namespace
- */
-int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb)
-{
- struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id);
- int ret = -ENODEV;
-
- mutex_lock(&nfsd_mutex);
- if (nn->nfsd_serv)
- ret = 0;
- else
- mutex_unlock(&nfsd_mutex);
-
- return ret;
-}
-
static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
struct netlink_callback *cb,
struct nfsd_genl_rqstp *rqstp)
@@ -1558,8 +1536,16 @@ static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
struct netlink_callback *cb)
{
- struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id);
int i, ret, rqstp_index = 0;
+ struct nfsd_net *nn;
+
+ mutex_lock(&nfsd_mutex);
+
+ nn = net_generic(sock_net(skb->sk), nfsd_net_id);
+ if (!nn->nfsd_serv) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
rcu_read_lock();
@@ -1636,22 +1622,10 @@ int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
ret = skb->len;
out:
rcu_read_unlock();
-
- return ret;
-}
-
-/**
- * nfsd_nl_rpc_status_get_done - rpc_status_get dumpit post-processing
- * @cb: netlink metadata and command arguments
- *
- * Return values:
- * %0: Success
- */
-int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb)
-{
+out_unlock:
mutex_unlock(&nfsd_mutex);
- return 0;
+ return ret;
}
/**
@@ -2195,6 +2169,8 @@ static __net_init int nfsd_net_init(struct net *net)
nn->nfsd_svcstats.program = &nfsd_program;
nn->nfsd_versions = NULL;
nn->nfsd4_minorversions = NULL;
+ nn->nfsd_info.mutex = &nfsd_mutex;
+ nn->nfsd_serv = NULL;
nfsd4_init_leases_net(nn);
get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
seqlock_init(&nn->writeverf_lock);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index cd9a6a1a9fc8..89d7918de7b1 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -672,7 +672,6 @@ int nfsd_create_serv(struct net *net)
return error;
}
spin_lock(&nfsd_notifier_lock);
- nn->nfsd_info.mutex = &nfsd_mutex;
nn->nfsd_serv = serv;
spin_unlock(&nfsd_notifier_lock);
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 89caef7513db..ba50388ee4bf 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -377,11 +377,12 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
* @target: offset number of an entry in the group (start point)
* @bsize: size in bits
* @lock: spin lock protecting @bitmap
+ * @wrap: whether to wrap around
*/
static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
unsigned int bsize,
- spinlock_t *lock)
+ spinlock_t *lock, bool wrap)
{
int pos, end = bsize;
@@ -397,6 +398,8 @@ static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
end = target;
}
+ if (!wrap)
+ return -ENOSPC;
/* wrap around */
for (pos = 0; pos < end; pos++) {
@@ -495,9 +498,10 @@ int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp)
* nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
* @inode: inode of metadata file using this allocator
* @req: nilfs_palloc_req structure exchanged for the allocation
+ * @wrap: whether to wrap around
*/
int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
- struct nilfs_palloc_req *req)
+ struct nilfs_palloc_req *req, bool wrap)
{
struct buffer_head *desc_bh, *bitmap_bh;
struct nilfs_palloc_group_desc *desc;
@@ -516,7 +520,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
entries_per_group = nilfs_palloc_entries_per_group(inode);
for (i = 0; i < ngroups; i += n) {
- if (group >= ngroups) {
+ if (group >= ngroups && wrap) {
/* wrap around */
group = 0;
maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr,
@@ -550,7 +554,14 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
bitmap_kaddr = kmap_local_page(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
pos = nilfs_palloc_find_available_slot(
- bitmap, group_offset, entries_per_group, lock);
+ bitmap, group_offset, entries_per_group, lock,
+ wrap);
+ /*
+ * Since the search for a free slot in the second and
+ * subsequent bitmap blocks always starts from the
+ * beginning, the wrap flag only has an effect on the
+ * first search.
+ */
kunmap_local(bitmap_kaddr);
if (pos >= 0)
goto found;
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index b667e869ac07..d825a9faca6d 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -50,8 +50,8 @@ struct nilfs_palloc_req {
struct buffer_head *pr_entry_bh;
};
-int nilfs_palloc_prepare_alloc_entry(struct inode *,
- struct nilfs_palloc_req *);
+int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
+ struct nilfs_palloc_req *req, bool wrap);
void nilfs_palloc_commit_alloc_entry(struct inode *,
struct nilfs_palloc_req *);
void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 180fc8d36213..fc1caf63a42a 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -75,7 +75,7 @@ int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req)
{
int ret;
- ret = nilfs_palloc_prepare_alloc_entry(dat, req);
+ ret = nilfs_palloc_prepare_alloc_entry(dat, req, true);
if (ret < 0)
return ret;
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index a002a44ff161..dddfa604491a 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -135,6 +135,9 @@ static bool nilfs_check_folio(struct folio *folio, char *kaddr)
goto Enamelen;
if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
goto Espan;
+ if (unlikely(p->inode &&
+ NILFS_PRIVATE_INODE(le64_to_cpu(p->inode))))
+ goto Einumber;
}
if (offs != limit)
goto Eend;
@@ -160,6 +163,9 @@ Enamelen:
goto bad_entry;
Espan:
error = "directory entry across blocks";
+ goto bad_entry;
+Einumber:
+ error = "disallowed inode number";
bad_entry:
nilfs_error(sb,
"bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d",
@@ -607,7 +613,7 @@ int nilfs_empty_dir(struct inode *inode)
kaddr = nilfs_get_folio(inode, i, &folio);
if (IS_ERR(kaddr))
- continue;
+ return 0;
de = (struct nilfs_dir_entry *)kaddr;
kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1);
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 612e609158b5..1e86b9303b7c 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -56,13 +56,10 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
struct nilfs_palloc_req req;
int ret;
- req.pr_entry_nr = 0; /*
- * 0 says find free inode from beginning
- * of a group. dull code!!
- */
+ req.pr_entry_nr = NILFS_FIRST_INO(ifile->i_sb);
req.pr_entry_bh = NULL;
- ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
+ ret = nilfs_palloc_prepare_alloc_entry(ifile, &req, false);
if (!ret) {
ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
&req.pr_entry_bh);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 728e90be3570..4017f7856440 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -116,9 +116,15 @@ enum {
#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
#define NILFS_MDT_INODE(sb, ino) \
- ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & BIT(ino)))
+ ((ino) < NILFS_USER_INO && (NILFS_MDT_INO_BITS & BIT(ino)))
#define NILFS_VALID_INODE(sb, ino) \
- ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & BIT(ino)))
+ ((ino) >= NILFS_FIRST_INO(sb) || \
+ ((ino) < NILFS_USER_INO && (NILFS_SYS_INO_BITS & BIT(ino))))
+
+#define NILFS_PRIVATE_INODE(ino) ({ \
+ ino_t __ino = (ino); \
+ ((__ino) < NILFS_USER_INO && (__ino) != NILFS_ROOT_INO && \
+ (__ino) != NILFS_SKETCH_INO); })
/**
* struct nilfs_transaction_info: context information for synchronization
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 60d4f59f7665..6ea81f1d5094 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1652,6 +1652,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
if (bh->b_folio != bd_folio) {
if (bd_folio) {
folio_lock(bd_folio);
+ folio_wait_writeback(bd_folio);
folio_clear_dirty_for_io(bd_folio);
folio_start_writeback(bd_folio);
folio_unlock(bd_folio);
@@ -1665,6 +1666,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
if (bh == segbuf->sb_super_root) {
if (bh->b_folio != bd_folio) {
folio_lock(bd_folio);
+ folio_wait_writeback(bd_folio);
folio_clear_dirty_for_io(bd_folio);
folio_start_writeback(bd_folio);
folio_unlock(bd_folio);
@@ -1681,6 +1683,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
}
if (bd_folio) {
folio_lock(bd_folio);
+ folio_wait_writeback(bd_folio);
folio_clear_dirty_for_io(bd_folio);
folio_start_writeback(bd_folio);
folio_unlock(bd_folio);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index f41d7b6d432c..e44dde57ab65 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -452,6 +452,12 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
}
nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
+ if (nilfs->ns_first_ino < NILFS_USER_INO) {
+ nilfs_err(nilfs->ns_sb,
+ "too small lower limit for non-reserved inode numbers: %u",
+ nilfs->ns_first_ino);
+ return -EINVAL;
+ }
nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 85da0629415d..1e829ed7b0ef 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -182,7 +182,7 @@ struct the_nilfs {
unsigned long ns_nrsvsegs;
unsigned long ns_first_data_block;
int ns_inode_size;
- int ns_first_ino;
+ unsigned int ns_first_ino;
u32 ns_crc_seed;
/* /sys/fs/<nilfs>/<device> */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f0467d3b3c88..6be175a1ab3c 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2366,6 +2366,11 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
}
list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+ ret = ocfs2_assure_trans_credits(handle, credits);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
ret = ocfs2_mark_extent_written(inode, &et, handle,
ue->ue_cpos, 1,
ue->ue_phys,
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 604fea3a26ff..530fba34f6d3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -446,6 +446,23 @@ bail:
}
/*
+ * Make sure handle has at least 'nblocks' credits available. If it does not
+ * have that many credits available, we will try to extend the handle to have
+ * enough credits. If that fails, we will restart transaction to have enough
+ * credits. Similar notes regarding data consistency and locking implications
+ * as for ocfs2_extend_trans() apply here.
+ */
+int ocfs2_assure_trans_credits(handle_t *handle, int nblocks)
+{
+ int old_nblks = jbd2_handle_buffer_credits(handle);
+
+ trace_ocfs2_assure_trans_credits(old_nblks);
+ if (old_nblks >= nblocks)
+ return 0;
+ return ocfs2_extend_trans(handle, nblocks - old_nblks);
+}
+
+/*
* If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
* If that fails, restart the transaction & regain write access for the
* buffer head which is used for metadata modifications.
@@ -479,12 +496,6 @@ bail:
return status;
}
-
-struct ocfs2_triggers {
- struct jbd2_buffer_trigger_type ot_triggers;
- int ot_offset;
-};
-
static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
{
return container_of(triggers, struct ocfs2_triggers, ot_triggers);
@@ -548,85 +559,76 @@ static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh)
{
+ struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
+
mlog(ML_ERROR,
"ocfs2_abort_trigger called by JBD2. bh = 0x%lx, "
"bh->b_blocknr = %llu\n",
(unsigned long)bh,
(unsigned long long)bh->b_blocknr);
- ocfs2_error(bh->b_assoc_map->host->i_sb,
+ ocfs2_error(ot->sb,
"JBD2 has aborted our journal, ocfs2 cannot continue\n");
}
-static struct ocfs2_triggers di_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_dinode, i_check),
-};
-
-static struct ocfs2_triggers eb_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
-};
-
-static struct ocfs2_triggers rb_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check),
-};
-
-static struct ocfs2_triggers gd_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
-};
-
-static struct ocfs2_triggers db_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_db_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
-};
+static void ocfs2_setup_csum_triggers(struct super_block *sb,
+ enum ocfs2_journal_trigger_type type,
+ struct ocfs2_triggers *ot)
+{
+ BUG_ON(type >= OCFS2_JOURNAL_TRIGGER_COUNT);
-static struct ocfs2_triggers xb_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
-};
+ switch (type) {
+ case OCFS2_JTR_DI:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_dinode, i_check);
+ break;
+ case OCFS2_JTR_EB:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_extent_block, h_check);
+ break;
+ case OCFS2_JTR_RB:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_refcount_block, rf_check);
+ break;
+ case OCFS2_JTR_GD:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_group_desc, bg_check);
+ break;
+ case OCFS2_JTR_DB:
+ ot->ot_triggers.t_frozen = ocfs2_db_frozen_trigger;
+ break;
+ case OCFS2_JTR_XB:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_xattr_block, xb_check);
+ break;
+ case OCFS2_JTR_DQ:
+ ot->ot_triggers.t_frozen = ocfs2_dq_frozen_trigger;
+ break;
+ case OCFS2_JTR_DR:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check);
+ break;
+ case OCFS2_JTR_DL:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check);
+ break;
+ case OCFS2_JTR_NONE:
+ /* To make compiler happy... */
+ return;
+ }
-static struct ocfs2_triggers dq_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_dq_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
-};
+ ot->ot_triggers.t_abort = ocfs2_abort_trigger;
+ ot->sb = sb;
+}
-static struct ocfs2_triggers dr_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
-};
+void ocfs2_initialize_journal_triggers(struct super_block *sb,
+ struct ocfs2_triggers triggers[])
+{
+ enum ocfs2_journal_trigger_type type;
-static struct ocfs2_triggers dl_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
-};
+ for (type = OCFS2_JTR_DI; type < OCFS2_JOURNAL_TRIGGER_COUNT; type++)
+ ocfs2_setup_csum_triggers(sb, type, &triggers[type]);
+}
static int __ocfs2_journal_access(handle_t *handle,
struct ocfs2_caching_info *ci,
@@ -708,56 +710,91 @@ static int __ocfs2_journal_access(handle_t *handle,
int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DI],
+ type);
}
int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_EB],
+ type);
}
int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &rb_triggers,
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_RB],
type);
}
int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_GD],
+ type);
}
int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DB],
+ type);
}
int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_XB],
+ type);
}
int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DQ],
+ type);
}
int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DR],
+ type);
}
int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DL],
+ type);
}
int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
@@ -778,13 +815,15 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
if (!is_handle_aborted(handle)) {
journal_t *journal = handle->h_transaction->t_journal;
- mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. "
- "Aborting transaction and journal.\n");
+ mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed: "
+ "handle type %u started at line %u, credits %u/%u "
+ "errcode %d. Aborting transaction and journal.\n",
+ handle->h_type, handle->h_line_no,
+ handle->h_requested_credits,
+ jbd2_handle_buffer_credits(handle), status);
handle->h_err = status;
jbd2_journal_abort_handle(handle);
jbd2_journal_abort(journal, status);
- ocfs2_abort(bh->b_assoc_map->host->i_sb,
- "Journal already aborted.\n");
}
}
}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 41c9fe7e62f9..e3c3a35dc5e0 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -243,6 +243,8 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb,
int ocfs2_commit_trans(struct ocfs2_super *osb,
handle_t *handle);
int ocfs2_extend_trans(handle_t *handle, int nblocks);
+int ocfs2_assure_trans_credits(handle_t *handle,
+ int nblocks);
int ocfs2_allocate_extend_trans(handle_t *handle,
int thresh);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a503c553bab2..8fe826143d7b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -284,6 +284,30 @@ enum ocfs2_mount_options
#define OCFS2_OSB_ERROR_FS 0x0004
#define OCFS2_DEFAULT_ATIME_QUANTUM 60
+struct ocfs2_triggers {
+ struct jbd2_buffer_trigger_type ot_triggers;
+ int ot_offset;
+ struct super_block *sb;
+};
+
+enum ocfs2_journal_trigger_type {
+ OCFS2_JTR_DI,
+ OCFS2_JTR_EB,
+ OCFS2_JTR_RB,
+ OCFS2_JTR_GD,
+ OCFS2_JTR_DB,
+ OCFS2_JTR_XB,
+ OCFS2_JTR_DQ,
+ OCFS2_JTR_DR,
+ OCFS2_JTR_DL,
+ OCFS2_JTR_NONE /* This must be the last entry */
+};
+
+#define OCFS2_JOURNAL_TRIGGER_COUNT OCFS2_JTR_NONE
+
+void ocfs2_initialize_journal_triggers(struct super_block *sb,
+ struct ocfs2_triggers triggers[]);
+
struct ocfs2_journal;
struct ocfs2_slot_info;
struct ocfs2_recovery_map;
@@ -351,6 +375,9 @@ struct ocfs2_super
struct ocfs2_journal *journal;
unsigned long osb_commit_interval;
+ /* Journal triggers for checksum */
+ struct ocfs2_triggers s_journal_triggers[OCFS2_JOURNAL_TRIGGER_COUNT];
+
struct delayed_work la_enable_wq;
/*
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 60e208b01c8d..0511c69c9fde 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2577,6 +2577,8 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
+DEFINE_OCFS2_INT_EVENT(ocfs2_assure_trans_credits);
+
DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8aabaed2c1cb..afee70125ae3 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1075,9 +1075,11 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root,
osb, &ocfs2_osb_debug_fops);
- if (ocfs2_meta_ecc(osb))
+ if (ocfs2_meta_ecc(osb)) {
+ ocfs2_initialize_journal_triggers(sb, osb->s_journal_triggers);
ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats,
osb->osb_debug_root);
+ }
status = ocfs2_mount_volume(sb);
if (status < 0)
diff --git a/fs/open.c b/fs/open.c
index 89cafb572061..278b3edcda44 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -202,13 +202,13 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
return error;
}
-SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
+SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length)
{
return do_sys_ftruncate(fd, length, 1);
}
#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
+COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length)
{
return do_sys_ftruncate(fd, length, 1);
}
@@ -1004,11 +1004,6 @@ static int do_dentry_open(struct file *f,
}
}
- /*
- * Once we return a file with FMODE_OPENED, __fput() will call
- * fsnotify_close(), so we need fsnotify_open() here for symmetry.
- */
- fsnotify_open(f);
return 0;
cleanup_all:
@@ -1085,8 +1080,19 @@ EXPORT_SYMBOL(file_path);
*/
int vfs_open(const struct path *path, struct file *file)
{
+ int ret;
+
file->f_path = *path;
- return do_dentry_open(file, NULL);
+ ret = do_dentry_open(file, NULL);
+ if (!ret) {
+ /*
+ * Once we return a file with FMODE_OPENED, __fput() will call
+ * fsnotify_close(), so we need fsnotify_open() here for
+ * symmetry.
+ */
+ fsnotify_open(file);
+ }
+ return ret;
}
struct file *dentry_open(const struct path *path, int flags,
@@ -1177,8 +1183,10 @@ struct file *kernel_file_open(const struct path *path, int flags,
error = do_dentry_open(f, NULL);
if (error) {
fput(f);
- f = ERR_PTR(error);
+ return ERR_PTR(error);
}
+
+ fsnotify_open(f);
return f;
}
EXPORT_SYMBOL_GPL(kernel_file_open);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 116f542442dd..ab65e98a1def 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -1314,10 +1314,6 @@ static int ovl_create_tmpfile(struct file *file, struct dentry *dentry,
int flags = file->f_flags | OVL_OPEN_FLAGS;
int err;
- err = ovl_copy_up(dentry->d_parent);
- if (err)
- return err;
-
old_cred = ovl_override_creds(dentry->d_sb);
err = ovl_setup_cred_for_create(dentry, inode, mode, old_cred);
if (err)
@@ -1360,6 +1356,10 @@ static int ovl_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
if (!OVL_FS(dentry->d_sb)->tmpfile)
return -EOPNOTSUPP;
+ err = ovl_copy_up(dentry->d_parent);
+ if (err)
+ return err;
+
err = ovl_want_write(dentry);
if (err)
return err;
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 063409069f56..5868cb222955 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -181,6 +181,10 @@ static int ovl_check_encode_origin(struct dentry *dentry)
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
bool decodable = ofs->config.nfs_export;
+ /* No upper layer? */
+ if (!ovl_upper_mnt(ofs))
+ return 1;
+
/* Lower file handle for non-upper non-decodable */
if (!ovl_dentry_upper(dentry) && !decodable)
return 1;
@@ -209,7 +213,7 @@ static int ovl_check_encode_origin(struct dentry *dentry)
* ovl_connect_layer() will try to make origin's layer "connected" by
* copying up a "connectable" ancestor.
*/
- if (d_is_dir(dentry) && ovl_upper_mnt(ofs) && decodable)
+ if (d_is_dir(dentry) && decodable)
return ovl_connect_layer(dentry);
/* Lower file handle for indexed and non-upper dir/non-dir */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 18550c071d71..72a1acd03675 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3214,7 +3214,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
mm = get_task_mm(task);
if (mm) {
seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
- seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages);
+ seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm));
seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
mmput(mm);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f8d35f993fe5..71e5039d940d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -707,6 +707,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#ifdef CONFIG_X86_USER_SHADOW_STACK
[ilog2(VM_SHADOW_STACK)] = "ss",
#endif
+#ifdef CONFIG_64BIT
+ [ilog2(VM_SEALED)] = "sl",
+#endif
};
size_t i;
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index bb86fc0641d8..6397fdefd876 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -134,7 +134,7 @@ module_param(enable_oplocks, bool, 0644);
MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
module_param(enable_gcm_256, bool, 0644);
-MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: n/N/0");
+MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/0");
module_param(require_gcm_256, bool, 0644);
MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0");
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 73482734a8d8..557b68e99d0a 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -1494,6 +1494,8 @@ struct cifs_aio_ctx {
struct cifs_io_request {
struct netfs_io_request rreq;
struct cifsFileInfo *cfile;
+ struct TCP_Server_Info *server;
+ pid_t pid;
};
/* asynchronous read support */
@@ -1504,7 +1506,6 @@ struct cifs_io_subrequest {
struct cifs_io_request *req;
};
ssize_t got_bytes;
- pid_t pid;
unsigned int xid;
int result;
bool have_xid;
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 25e9ab947c17..595c4b673707 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -1345,8 +1345,8 @@ cifs_async_readv(struct cifs_io_subrequest *rdata)
if (rc)
return rc;
- smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid);
- smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16));
+ smb->hdr.Pid = cpu_to_le16((__u16)rdata->req->pid);
+ smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->req->pid >> 16));
smb->AndXCommand = 0xFF; /* none */
smb->Fid = rdata->req->cfile->fid.netfid;
@@ -1689,8 +1689,8 @@ cifs_async_writev(struct cifs_io_subrequest *wdata)
if (rc)
goto async_writev_out;
- smb->hdr.Pid = cpu_to_le16((__u16)wdata->pid);
- smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16));
+ smb->hdr.Pid = cpu_to_le16((__u16)wdata->req->pid);
+ smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->req->pid >> 16));
smb->AndXCommand = 0xFF; /* none */
smb->Fid = wdata->req->cfile->fid.netfid;
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 9d5c2440abfc..1374635e89fa 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -134,17 +134,15 @@ fail:
static bool cifs_clamp_length(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
- struct TCP_Server_Info *server;
struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
+ struct TCP_Server_Info *server = req->server;
struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
size_t rsize = 0;
int rc;
rdata->xid = get_xid();
rdata->have_xid = true;
-
- server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses);
rdata->server = server;
if (cifs_sb->ctx->rsize == 0)
@@ -179,15 +177,8 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq)
struct netfs_io_request *rreq = subreq->rreq;
struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
- struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
- pid_t pid;
int rc = 0;
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
- pid = req->cfile->pid;
- else
- pid = current->tgid; // Ummm... This may be a workqueue
-
cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n",
__func__, rreq->debug_id, subreq->debug_index, rreq->mapping,
subreq->transferred, subreq->len);
@@ -201,16 +192,8 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq)
}
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
- rdata->pid = pid;
-
- rc = adjust_credits(rdata->server, &rdata->credits, rdata->subreq.len);
- if (!rc) {
- if (rdata->req->cfile->invalidHandle)
- rc = -EAGAIN;
- else
- rc = rdata->server->ops->async_readv(rdata);
- }
+ rc = rdata->server->ops->async_readv(rdata);
out:
if (rc)
netfs_subreq_terminated(subreq, rc, false);
@@ -245,11 +228,15 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file)
rreq->rsize = cifs_sb->ctx->rsize;
rreq->wsize = cifs_sb->ctx->wsize;
+ req->pid = current->tgid; // Ummm... This may be a workqueue
if (file) {
open_file = file->private_data;
rreq->netfs_priv = file->private_data;
req->cfile = cifsFileInfo_get(open_file);
+ req->server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+ req->pid = req->cfile->pid;
} else if (rreq->origin != NETFS_WRITEBACK) {
WARN_ON_ONCE(1);
return -EIO;
@@ -259,35 +246,6 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file)
}
/*
- * Expand the size of a readahead to the size of the rsize, if at least as
- * large as a page, allowing for the possibility that rsize is not pow-2
- * aligned.
- */
-static void cifs_expand_readahead(struct netfs_io_request *rreq)
-{
- unsigned int rsize = rreq->rsize;
- loff_t misalignment, i_size = i_size_read(rreq->inode);
-
- if (rsize < PAGE_SIZE)
- return;
-
- if (rsize < INT_MAX)
- rsize = roundup_pow_of_two(rsize);
- else
- rsize = ((unsigned int)INT_MAX + 1) / 2;
-
- misalignment = rreq->start & (rsize - 1);
- if (misalignment) {
- rreq->start -= misalignment;
- rreq->len += misalignment;
- }
-
- rreq->len = round_up(rreq->len, rsize);
- if (rreq->start < i_size && rreq->len > i_size - rreq->start)
- rreq->len = i_size - rreq->start;
-}
-
-/*
* Completion of a request operation.
*/
static void cifs_rreq_done(struct netfs_io_request *rreq)
@@ -342,7 +300,6 @@ const struct netfs_request_ops cifs_req_ops = {
.init_request = cifs_init_request,
.free_request = cifs_free_request,
.free_subrequest = cifs_free_subrequest,
- .expand_readahead = cifs_expand_readahead,
.clamp_length = cifs_clamp_length,
.issue_read = cifs_req_issue_read,
.done = cifs_rreq_done,
@@ -3200,8 +3157,6 @@ static int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t ret;
- WARN_ON_ONCE(iov_iter_count(iter) != PAGE_SIZE);
-
if (iov_iter_rw(iter) == READ)
ret = netfs_unbuffered_read_iter_locked(iocb, iter);
else
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 993ac36c3d58..2ae2dbb6202b 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -4484,6 +4484,16 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
return rc;
}
+static void smb2_readv_worker(struct work_struct *work)
+{
+ struct cifs_io_subrequest *rdata =
+ container_of(work, struct cifs_io_subrequest, subreq.work);
+
+ netfs_subreq_terminated(&rdata->subreq,
+ (rdata->result == 0 || rdata->result == -EAGAIN) ?
+ rdata->got_bytes : rdata->result, true);
+}
+
static void
smb2_readv_callback(struct mid_q_entry *mid)
{
@@ -4577,12 +4587,9 @@ smb2_readv_callback(struct mid_q_entry *mid)
if (rdata->subreq.start < rdata->subreq.rreq->i_size)
rdata->result = 0;
}
- if (rdata->result == 0 || rdata->result == -EAGAIN)
- iov_iter_advance(&rdata->subreq.io_iter, rdata->got_bytes);
rdata->credits.value = 0;
- netfs_subreq_terminated(&rdata->subreq,
- (rdata->result == 0 || rdata->result == -EAGAIN) ?
- rdata->got_bytes : rdata->result, true);
+ INIT_WORK(&rdata->subreq.work, smb2_readv_worker);
+ queue_work(cifsiod_wq, &rdata->subreq.work);
release_mid(mid);
add_credits(server, &credits, 0);
}
@@ -4614,7 +4621,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata)
io_parms.length = rdata->subreq.len;
io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid;
io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid;
- io_parms.pid = rdata->pid;
+ io_parms.pid = rdata->req->pid;
rc = smb2_new_read_req(
(void **) &buf, &total_len, &io_parms, rdata, 0, 0);
@@ -4789,7 +4796,6 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->result = -ENOSPC;
else
wdata->subreq.len = written;
- iov_iter_advance(&wdata->subreq.io_iter, written);
break;
case MID_REQUEST_SUBMITTED:
case MID_RETRY_NEEDED:
@@ -4867,7 +4873,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
.length = wdata->subreq.len,
.persistent_fid = wdata->req->cfile->fid.persistent_fid,
.volatile_fid = wdata->req->cfile->fid.volatile_fid,
- .pid = wdata->pid,
+ .pid = wdata->req->pid,
};
io_parms = &_io_parms;
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 02135a605305..1476c445cadc 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -216,8 +216,8 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid)
}
tcon = smb2_find_smb_sess_tcon_unlocked(ses, tid);
if (!tcon) {
- cifs_put_smb_ses(ses);
spin_unlock(&cifs_tcp_ses_lock);
+ cifs_put_smb_ses(ses);
return NULL;
}
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index b6c5a8ea3887..e7e07891781b 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -630,6 +630,12 @@ smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls)
return name;
}
+ if (*name == '\\') {
+ pr_err("not allow directory name included leading slash\n");
+ kfree(name);
+ return ERR_PTR(-EINVAL);
+ }
+
ksmbd_conv_path_to_unix(name);
ksmbd_strip_last_slash(name);
return name;
@@ -2361,7 +2367,8 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
if (rc > 0) {
rc = ksmbd_vfs_remove_xattr(idmap,
path,
- attr_name);
+ attr_name,
+ get_write);
if (rc < 0) {
ksmbd_debug(SMB,
@@ -2376,7 +2383,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
} else {
rc = ksmbd_vfs_setxattr(idmap, path, attr_name, value,
le16_to_cpu(eabuf->EaValueLength),
- 0, true);
+ 0, get_write);
if (rc < 0) {
ksmbd_debug(SMB,
"ksmbd_vfs_setxattr is failed(%d)\n",
@@ -2468,7 +2475,7 @@ static int smb2_remove_smb_xattrs(const struct path *path)
!strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX,
STREAM_PREFIX_LEN)) {
err = ksmbd_vfs_remove_xattr(idmap, path,
- name);
+ name, true);
if (err)
ksmbd_debug(SMB, "remove xattr failed : %s\n",
name);
@@ -2842,20 +2849,11 @@ int smb2_open(struct ksmbd_work *work)
}
if (req->NameLength) {
- if ((req->CreateOptions & FILE_DIRECTORY_FILE_LE) &&
- *(char *)req->Buffer == '\\') {
- pr_err("not allow directory name included leading slash\n");
- rc = -EINVAL;
- goto err_out2;
- }
-
name = smb2_get_name((char *)req + le16_to_cpu(req->NameOffset),
le16_to_cpu(req->NameLength),
work->conn->local_nls);
if (IS_ERR(name)) {
rc = PTR_ERR(name);
- if (rc != -ENOMEM)
- rc = -ENOENT;
name = NULL;
goto err_out2;
}
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 51b1b0bed616..9e859ba010cf 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -1058,16 +1058,21 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
}
int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
- const struct path *path, char *attr_name)
+ const struct path *path, char *attr_name,
+ bool get_write)
{
int err;
- err = mnt_want_write(path->mnt);
- if (err)
- return err;
+ if (get_write == true) {
+ err = mnt_want_write(path->mnt);
+ if (err)
+ return err;
+ }
err = vfs_removexattr(idmap, path->dentry, attr_name);
- mnt_drop_write(path->mnt);
+
+ if (get_write == true)
+ mnt_drop_write(path->mnt);
return err;
}
@@ -1380,7 +1385,7 @@ int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path)
ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name));
if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) {
- err = ksmbd_vfs_remove_xattr(idmap, path, name);
+ err = ksmbd_vfs_remove_xattr(idmap, path, name, true);
if (err)
ksmbd_debug(SMB, "remove xattr failed : %s\n", name);
}
diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h
index cfe1c8092f23..cb76f4b5bafe 100644
--- a/fs/smb/server/vfs.h
+++ b/fs/smb/server/vfs.h
@@ -114,7 +114,8 @@ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap,
int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
size_t *xattr_stream_name_size, int s_type);
int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
- const struct path *path, char *attr_name);
+ const struct path *path, char *attr_name,
+ bool get_write);
int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
unsigned int flags, struct path *parent_path,
struct path *path, bool caseless);
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index 6cb599cd287e..8b2e37c8716e 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -254,7 +254,8 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp)
ci->m_flags &= ~S_DEL_ON_CLS_STREAM;
err = ksmbd_vfs_remove_xattr(file_mnt_idmap(filp),
&filp->f_path,
- fp->stream.name);
+ fp->stream.name,
+ true);
if (err)
pr_err("remove xattr failed : %s\n",
fp->stream.name);
diff --git a/fs/super.c b/fs/super.c
index b72f1d288e95..095ba793e10c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1502,8 +1502,17 @@ static int fs_bdev_thaw(struct block_device *bdev)
lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
+ /*
+ * The block device may have been frozen before it was claimed by a
+ * filesystem. Concurrently another process might try to mount that
+ * frozen block device and has temporarily claimed the block device for
+ * that purpose causing a concurrent fs_bdev_thaw() to end up here. The
+ * mounter is already about to abort mounting because they still saw an
+ * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return
+ * NULL in that case.
+ */
sb = get_bdev_super(bdev);
- if (WARN_ON_ONCE(!sb))
+ if (!sb)
return -EINVAL;
if (sb->s_op->thaw_super)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c101cf266bc4..6af6f744fdd6 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4058,20 +4058,32 @@ xfs_bmapi_reserve_delalloc(
xfs_extlen_t indlen;
uint64_t fdblocks;
int error;
- xfs_fileoff_t aoff = off;
+ xfs_fileoff_t aoff;
+ bool use_cowextszhint =
+ whichfork == XFS_COW_FORK && !prealloc;
+retry:
/*
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
+ aoff = off;
alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
prealloc = alen - len;
- /* Figure out the extent size, adjust alen */
- if (whichfork == XFS_COW_FORK) {
+ /*
+ * If we're targetting the COW fork but aren't creating a speculative
+ * posteof preallocation, try to expand the reservation to align with
+ * the COW extent size hint if there's sufficient free space.
+ *
+ * Unlike the data fork, the CoW cancellation functions will free all
+ * the reservations at inactivation, so we don't require that every
+ * delalloc reservation have a dirty pagecache.
+ */
+ if (use_cowextszhint) {
struct xfs_bmbt_irec prev;
xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
@@ -4090,7 +4102,7 @@ xfs_bmapi_reserve_delalloc(
*/
error = xfs_quota_reserve_blkres(ip, alen);
if (error)
- return error;
+ goto out;
/*
* Split changing sb for alen and indlen since they could be coming
@@ -4140,6 +4152,17 @@ out_unreserve_frextents:
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
xfs_quota_unreserve_blkres(ip, alen);
+out:
+ if (error == -ENOSPC || error == -EDQUOT) {
+ trace_xfs_delalloc_enospc(ip, off, len);
+
+ if (prealloc || use_cowextszhint) {
+ /* retry without any preallocation */
+ use_cowextszhint = false;
+ prealloc = 0;
+ goto retry;
+ }
+ }
return error;
}
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 97996cb79aaa..454b63ef7201 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -996,7 +996,7 @@ struct xfs_getparents_by_handle {
#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom)
#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
-#define XFS_IOC_EXCHANGE_RANGE _IOWR('X', 129, struct xfs_exchange_range)
+#define XFS_IOC_EXCHANGE_RANGE _IOW ('X', 129, struct xfs_exchange_range)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index e7a7bfbe75b4..513b50da6215 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -379,10 +379,13 @@ xfs_dinode_verify_fork(
/*
* A directory small enough to fit in the inode must be stored
* in local format. The directory sf <-> extents conversion
- * code updates the directory size accordingly.
+ * code updates the directory size accordingly. Directories
+ * being truncated have zero size and are not subject to this
+ * check.
*/
if (S_ISDIR(mode)) {
- if (be64_to_cpu(dip->di_size) <= fork_size &&
+ if (dip->di_size &&
+ be64_to_cpu(dip->di_size) <= fork_size &&
fork_format != XFS_DINODE_FMT_LOCAL)
return __this_address;
}
@@ -528,9 +531,19 @@ xfs_dinode_verify(
if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN)
return __this_address;
- /* No zero-length symlinks/dirs. */
- if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
- return __this_address;
+ /*
+ * No zero-length symlinks/dirs unless they're unlinked and hence being
+ * inactivated.
+ */
+ if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) {
+ if (dip->di_version > 1) {
+ if (dip->di_nlink)
+ return __this_address;
+ } else {
+ if (dip->di_onlink)
+ return __this_address;
+ }
+ }
fa = xfs_dinode_verify_nrext64(mp, dip);
if (fa)
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 09e4bf949bf8..6b56f0f6d4c1 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1038,11 +1038,12 @@ xfs_log_sb(
* and hence we don't need have to update it here.
*/
if (xfs_has_lazysbcount(mp)) {
- mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+ mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount);
mp->m_sb.sb_ifree = min_t(uint64_t,
- percpu_counter_sum(&mp->m_ifree),
+ percpu_counter_sum_positive(&mp->m_ifree),
mp->m_sb.sb_icount);
- mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ mp->m_sb.sb_fdblocks =
+ percpu_counter_sum_positive(&mp->m_fdblocks);
}
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index ac2e77ebb54c..a4d9fbc21b83 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -486,13 +486,11 @@ out_unlock:
/*
* Test whether it is appropriate to check an inode for and free post EOF
- * blocks. The 'force' parameter determines whether we should also consider
- * regular files that are marked preallocated or append-only.
+ * blocks.
*/
bool
xfs_can_free_eofblocks(
- struct xfs_inode *ip,
- bool force)
+ struct xfs_inode *ip)
{
struct xfs_bmbt_irec imap;
struct xfs_mount *mp = ip->i_mount;
@@ -526,11 +524,11 @@ xfs_can_free_eofblocks(
return false;
/*
- * Do not free real preallocated or append-only files unless the file
- * has delalloc blocks and we are forced to remove them.
+ * Only free real extents for inodes with persistent preallocations or
+ * the append-only flag.
*/
if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
- if (!force || ip->i_delayed_blks == 0)
+ if (ip->i_delayed_blks == 0)
return false;
/*
@@ -584,6 +582,22 @@ xfs_free_eofblocks(
/* Wait on dio to ensure i_size has settled. */
inode_dio_wait(VFS_I(ip));
+ /*
+ * For preallocated files only free delayed allocations.
+ *
+ * Note that this means we also leave speculative preallocations in
+ * place for preallocated files.
+ */
+ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) {
+ if (ip->i_delayed_blks) {
+ xfs_bmap_punch_delalloc_range(ip,
+ round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
+ LLONG_MAX);
+ }
+ xfs_inode_clear_eofblocks_tag(ip);
+ return 0;
+ }
+
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error) {
ASSERT(xfs_is_shutdown(mp));
@@ -891,7 +905,7 @@ xfs_prepare_shift(
* Trim eofblocks to avoid shifting uninitialized post-eof preallocation
* into the accessible region of the file.
*/
- if (xfs_can_free_eofblocks(ip, true)) {
+ if (xfs_can_free_eofblocks(ip)) {
error = xfs_free_eofblocks(ip);
if (error)
return error;
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 51f84d8ff372..eb0895bfb9da 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -63,7 +63,7 @@ int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
/* EOF block manipulation functions */
-bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+bool xfs_can_free_eofblocks(struct xfs_inode *ip);
int xfs_free_eofblocks(struct xfs_inode *ip);
int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0953163a2d84..9967334ea99f 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1155,7 +1155,7 @@ xfs_inode_free_eofblocks(
}
*lockflags |= XFS_IOLOCK_EXCL;
- if (xfs_can_free_eofblocks(ip, false))
+ if (xfs_can_free_eofblocks(ip))
return xfs_free_eofblocks(ip);
/* inode could be preallocated or append-only */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 58fb7a5062e1..a4e3cd8971fc 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -42,6 +42,7 @@
#include "xfs_pnfs.h"
#include "xfs_parent.h"
#include "xfs_xattr.h"
+#include "xfs_sb.h"
struct kmem_cache *xfs_inode_cache;
@@ -870,9 +871,16 @@ xfs_init_new_inode(
* this saves us from needing to run a separate transaction to set the
* fork offset in the immediate future.
*/
- if (init_xattrs && xfs_has_attr(mp)) {
+ if (init_xattrs) {
ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
+
+ if (!xfs_has_attr(mp)) {
+ spin_lock(&mp->m_sb_lock);
+ xfs_add_attr(mp);
+ spin_unlock(&mp->m_sb_lock);
+ xfs_log_sb(tp);
+ }
}
/*
@@ -1595,7 +1603,7 @@ xfs_release(
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
return 0;
- if (xfs_can_free_eofblocks(ip, false)) {
+ if (xfs_can_free_eofblocks(ip)) {
/*
* Check if the inode is being opened, written and closed
* frequently and we have delayed allocation blocks outstanding
@@ -1856,15 +1864,13 @@ xfs_inode_needs_inactive(
/*
* This file isn't being freed, so check if there are post-eof blocks
- * to free. @force is true because we are evicting an inode from the
- * cache. Post-eof blocks must be freed, lest we end up with broken
- * free space accounting.
+ * to free.
*
* Note: don't bother with iolock here since lockdep complains about
* acquiring it in reclaim context. We have the only reference to the
* inode at this point anyways.
*/
- return xfs_can_free_eofblocks(ip, true);
+ return xfs_can_free_eofblocks(ip);
}
/*
@@ -1947,15 +1953,11 @@ xfs_inactive(
if (VFS_I(ip)->i_nlink != 0) {
/*
- * force is true because we are evicting an inode from the
- * cache. Post-eof blocks must be freed, lest we end up with
- * broken free space accounting.
- *
* Note: don't bother with iolock here since lockdep complains
* about acquiring it in reclaim context. We have the only
* reference to the inode at this point anyways.
*/
- if (xfs_can_free_eofblocks(ip, true))
+ if (xfs_can_free_eofblocks(ip))
error = xfs_free_eofblocks(ip);
goto out;
@@ -2548,11 +2550,26 @@ xfs_ifree_cluster(
* This buffer may not have been correctly initialised as we
* didn't read it from disk. That's not important because we are
* only using to mark the buffer as stale in the log, and to
- * attach stale cached inodes on it. That means it will never be
- * dispatched for IO. If it is, we want to know about it, and we
- * want it to fail. We can acheive this by adding a write
- * verifier to the buffer.
+ * attach stale cached inodes on it.
+ *
+ * For the inode that triggered the cluster freeing, this
+ * attachment may occur in xfs_inode_item_precommit() after we
+ * have marked this buffer stale. If this buffer was not in
+ * memory before xfs_ifree_cluster() started, it will not be
+ * marked XBF_DONE and this will cause problems later in
+ * xfs_inode_item_precommit() when we trip over a (stale, !done)
+ * buffer to attached to the transaction.
+ *
+ * Hence we have to mark the buffer as XFS_DONE here. This is
+ * safe because we are also marking the buffer as XBF_STALE and
+ * XFS_BLI_STALE. That means it will never be dispatched for
+ * IO and it won't be unlocked until the cluster freeing has
+ * been committed to the journal and the buffer unpinned. If it
+ * is written, we want to know about it, and we want it to
+ * fail. We can acheive this by adding a write verifier to the
+ * buffer.
*/
+ bp->b_flags |= XBF_DONE;
bp->b_ops = &xfs_inode_buf_ops;
/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 378342673925..414903885ab9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1148,33 +1148,23 @@ xfs_buffered_write_iomap_begin(
}
}
-retry:
- error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks,
- allocfork == XFS_DATA_FORK ? &imap : &cmap,
- allocfork == XFS_DATA_FORK ? &icur : &ccur,
- allocfork == XFS_DATA_FORK ? eof : cow_eof);
- switch (error) {
- case 0:
- break;
- case -ENOSPC:
- case -EDQUOT:
- /* retry without any preallocation */
- trace_xfs_delalloc_enospc(ip, offset, count);
- if (prealloc_blocks) {
- prealloc_blocks = 0;
- goto retry;
- }
- fallthrough;
- default:
- goto out_unlock;
- }
-
if (allocfork == XFS_COW_FORK) {
+ error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks, &cmap,
+ &ccur, cow_eof);
+ if (error)
+ goto out_unlock;
+
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap);
goto found_cow;
}
+ error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks, &imap, &icur,
+ eof);
+ if (error)
+ goto out_unlock;
+
/*
* Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
* them out if the write happens to fail.