summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/bcachefs/backpointers.c10
-rw-r--r--fs/bcachefs/bcachefs.h2
-rw-r--r--fs/bcachefs/btree_iter.c2
-rw-r--r--fs/bcachefs/btree_key_cache.c37
-rw-r--r--fs/bcachefs/btree_key_cache_types.h34
-rw-r--r--fs/bcachefs/btree_trans_commit.c169
-rw-r--r--fs/bcachefs/btree_types.h35
-rw-r--r--fs/bcachefs/btree_update_interior.c30
-rw-r--r--fs/bcachefs/btree_update_interior.h1
-rw-r--r--fs/bcachefs/data_update.c28
-rw-r--r--fs/bcachefs/disk_groups.c4
-rw-r--r--fs/bcachefs/ec.c16
-rw-r--r--fs/bcachefs/fs-io-pagecache.c2
-rw-r--r--fs/bcachefs/fs-io-pagecache.h2
-rw-r--r--fs/bcachefs/fs.c8
-rw-r--r--fs/bcachefs/fsck.c2
-rw-r--r--fs/bcachefs/inode.c8
-rw-r--r--fs/bcachefs/io_write.c2
-rw-r--r--fs/bcachefs/journal.c31
-rw-r--r--fs/bcachefs/journal.h98
-rw-r--r--fs/bcachefs/journal_io.c7
-rw-r--r--fs/bcachefs/journal_reclaim.c42
-rw-r--r--fs/bcachefs/journal_types.h26
-rw-r--r--fs/bcachefs/six.c7
-rw-r--r--fs/bcachefs/subvolume_types.h2
-rw-r--r--fs/bcachefs/trace.h11
-rw-r--r--fs/bcachefs/xattr.c9
-rw-r--r--fs/btrfs/ctree.c2
-rw-r--r--fs/btrfs/delayed-ref.c4
-rw-r--r--fs/btrfs/extent-tree.c25
-rw-r--r--fs/btrfs/extent-tree.h3
-rw-r--r--fs/btrfs/inode.c7
-rw-r--r--fs/btrfs/ioctl.c10
-rw-r--r--fs/btrfs/qgroup.c10
-rw-r--r--fs/btrfs/raid-stripe-tree.c2
-rw-r--r--fs/btrfs/scrub.c10
-rw-r--r--fs/btrfs/volumes.c6
-rw-r--r--fs/btrfs/zoned.c7
-rw-r--r--fs/nfsd/cache.h4
-rw-r--r--fs/nfsd/nfs4state.c2
-rw-r--r--fs/nfsd/nfscache.c87
-rw-r--r--fs/nfsd/nfssvc.c14
-rw-r--r--fs/overlayfs/params.c11
-rw-r--r--fs/overlayfs/util.c2
-rw-r--r--fs/smb/client/cifs_spnego.c4
-rw-r--r--fs/smb/client/connect.c6
-rw-r--r--fs/smb/client/sess.c22
-rw-r--r--fs/smb/client/smb2transport.c5
-rw-r--r--fs/xfs/Kconfig2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c27
-rw-r--r--fs/xfs/libxfs/xfs_defer.c28
-rw-r--r--fs/xfs/libxfs/xfs_defer.h2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c3
-rw-r--r--fs/xfs/xfs_inode_item_recover.c46
-rw-r--r--fs/xfs/xfs_log.c23
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_reflink.c1
57 files changed, 487 insertions, 515 deletions
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index ef02c9bb0354..23c0834a97a4 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -313,17 +313,17 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
bp.level - 1,
0);
b = bch2_btree_iter_peek_node(iter);
- if (IS_ERR(b))
+ if (IS_ERR_OR_NULL(b))
goto err;
BUG_ON(b->c.level != bp.level - 1);
- if (b && extent_matches_bp(c, bp.btree_id, bp.level,
- bkey_i_to_s_c(&b->key),
- bucket, bp))
+ if (extent_matches_bp(c, bp.btree_id, bp.level,
+ bkey_i_to_s_c(&b->key),
+ bucket, bp))
return b;
- if (b && btree_node_will_make_reachable(b)) {
+ if (btree_node_will_make_reachable(b)) {
b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
} else {
backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 9cb8684959ee..403aa3389fcc 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -617,7 +617,7 @@ struct journal_seq_blacklist_table {
u64 start;
u64 end;
bool dirty;
- } entries[0];
+ } entries[];
};
struct journal_keys {
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index c2adf3fbb0b3..6fa90bcd7016 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -3087,8 +3087,6 @@ void bch2_trans_put(struct btree_trans *trans)
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
}
- bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-
kfree(trans->extra_journal_entries.data);
if (trans->fs_usage_deltas) {
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 9b78f78a75b5..37fbf22de8fc 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -89,10 +89,13 @@ static void bkey_cached_free(struct btree_key_cache *bc,
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
- if (ck->c.lock.readers)
+ if (ck->c.lock.readers) {
list_move_tail(&ck->list, &bc->freed_pcpu);
- else
+ bc->nr_freed_pcpu++;
+ } else {
list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ bc->nr_freed_nonpcpu++;
+ }
atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
@@ -109,6 +112,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
{
struct bkey_cached *pos;
+ bc->nr_freed_nonpcpu++;
+
list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
pos->btree_trans_barrier_seq)) {
@@ -158,6 +163,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
#else
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ bc->nr_freed_nonpcpu++;
mutex_unlock(&bc->lock);
#endif
} else {
@@ -217,6 +223,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
f->nr < ARRAY_SIZE(f->objs) / 2) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
+ bc->nr_freed_nonpcpu--;
f->objs[f->nr++] = ck;
}
@@ -229,6 +236,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
if (!list_empty(&bc->freed_nonpcpu)) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
+ bc->nr_freed_nonpcpu--;
}
mutex_unlock(&bc->lock);
#endif
@@ -664,7 +672,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
goto out;
bch2_journal_pin_drop(j, &ck->journal);
- bch2_journal_preres_put(j, &ck->res);
BUG_ON(!btree_node_locked(c_iter.path, 0));
@@ -762,18 +769,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
BUG_ON(insert->k.u64s > ck->u64s);
- if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- int difference;
-
- BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);
-
- difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
- if (difference > 0) {
- trans->journal_preres.u64s -= difference;
- ck->res.u64s += difference;
- }
- }
-
bkey_copy(ck->k, insert);
ck->valid = true;
@@ -850,6 +845,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
+ scanned += bc->nr_freed_nonpcpu;
+
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@@ -859,13 +856,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
- scanned++;
freed++;
+ bc->nr_freed_nonpcpu--;
}
if (scanned >= nr)
goto out;
+ scanned += bc->nr_freed_pcpu;
+
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@@ -875,8 +874,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
- scanned++;
freed++;
+ bc->nr_freed_pcpu--;
}
if (scanned >= nr)
@@ -982,6 +981,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
}
#endif
+ BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
+ BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
+
list_splice(&bc->freed_pcpu, &items);
list_splice(&bc->freed_nonpcpu, &items);
@@ -991,7 +993,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
cond_resched();
bch2_journal_pin_drop(&c->journal, &ck->journal);
- bch2_journal_preres_put(&c->journal, &ck->res);
list_del(&ck->list);
kfree(ck->k);
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
new file mode 100644
index 000000000000..290e4e57df5b
--- /dev/null
+++ b/fs/bcachefs/btree_key_cache_types.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+
+struct btree_key_cache_freelist {
+ struct bkey_cached *objs[16];
+ unsigned nr;
+};
+
+struct btree_key_cache {
+ struct mutex lock;
+ struct rhashtable table;
+ bool table_init_done;
+
+ struct list_head freed_pcpu;
+ size_t nr_freed_pcpu;
+ struct list_head freed_nonpcpu;
+ size_t nr_freed_nonpcpu;
+
+ struct shrinker *shrink;
+ unsigned shrink_iter;
+ struct btree_key_cache_freelist __percpu *pcpu_freed;
+
+ atomic_long_t nr_freed;
+ atomic_long_t nr_keys;
+ atomic_long_t nr_dirty;
+};
+
+struct bkey_cached_key {
+ u32 btree_id;
+ struct bpos pos;
+} __packed __aligned(4);
+
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index decad7b66c59..12907beda98c 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -78,6 +78,53 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
bch2_btree_init_next(trans, b);
}
+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+ while (--i >= trans->updates) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
+
+ bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
+ }
+
+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
+}
+
+static inline int bch2_trans_lock_write(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+
+ EBUG_ON(trans->write_locked);
+
+ trans_for_each_update(trans, i) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
+
+ if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+ return trans_lock_write_fail(trans, i);
+
+ if (!i->cached)
+ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+ }
+
+ trans->write_locked = true;
+ return 0;
+}
+
+static inline void bch2_trans_unlock_write(struct btree_trans *trans)
+{
+ if (likely(trans->write_locked)) {
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ if (!same_leaf_as_prev(trans, i))
+ bch2_btree_node_unlock_write_inlined(trans, i->path,
+ insert_l(i)->b);
+ trans->write_locked = false;
+ }
+}
+
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
@@ -276,17 +323,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
}
-static noinline int
-bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags,
- unsigned long trace_ip)
-{
- return drop_locks_do(trans,
- bch2_journal_preres_get(&trans->c->journal,
- &trans->journal_preres,
- trans->journal_preres_u64s,
- (flags & BCH_WATERMARK_MASK)));
-}
-
static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
unsigned flags)
{
@@ -321,6 +357,45 @@ static inline int btree_key_can_insert(struct btree_trans *trans,
return 0;
}
+noinline static int
+btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
+ struct btree_path *path, unsigned new_u64s)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+ struct bkey_cached *ck = (void *) path->l[0].b;
+ struct bkey_i *new_k;
+ int ret;
+
+ bch2_trans_unlock_write(trans);
+ bch2_trans_unlock(trans);
+
+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+ if (!new_k) {
+ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_id_str(path->btree_id), new_u64s);
+ return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+ }
+
+ ret = bch2_trans_relock(trans) ?:
+ bch2_trans_lock_write(trans);
+ if (unlikely(ret)) {
+ kfree(new_k);
+ return ret;
+ }
+
+ memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
+
+ trans_for_each_update(trans, i)
+ if (i->old_v == &ck->k->v)
+ i->old_v = &new_k->v;
+
+ kfree(ck->k);
+ ck->u64s = new_u64s;
+ ck->k = new_k;
+ return 0;
+}
+
static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
struct btree_path *path, unsigned u64s)
{
@@ -347,12 +422,9 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
return 0;
new_u64s = roundup_pow_of_two(u64s);
- new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
- if (!new_k) {
- bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_id_str(path->btree_id), new_u64s);
- return -BCH_ERR_ENOMEM_btree_key_cache_insert;
- }
+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+ if (unlikely(!new_k))
+ return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
trans_for_each_update(trans, i)
if (i->old_v == &ck->k->v)
@@ -732,37 +804,6 @@ revert_fs_usage:
return ret;
}
-static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
-{
- while (--i >= trans->updates) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
- }
-
- trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
-}
-
-static inline int trans_lock_write(struct btree_trans *trans)
-{
- struct btree_insert_entry *i;
-
- trans_for_each_update(trans, i) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
- return trans_lock_write_fail(trans, i);
-
- if (!i->cached)
- bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
- }
-
- return 0;
-}
-
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
struct btree_insert_entry *i;
@@ -830,15 +871,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
}
}
- ret = bch2_journal_preres_get(&c->journal,
- &trans->journal_preres, trans->journal_preres_u64s,
- (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
- if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked))
- ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip);
- if (unlikely(ret))
- return ret;
-
- ret = trans_lock_write(trans);
+ ret = bch2_trans_lock_write(trans);
if (unlikely(ret))
return ret;
@@ -847,10 +880,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
if (!ret && unlikely(trans->journal_replay_not_finished))
bch2_drop_overwrites_from_journal(trans);
- trans_for_each_update(trans, i)
- if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_unlock_write_inlined(trans, i->path,
- insert_l(i)->b);
+ bch2_trans_unlock_write(trans);
if (!ret && trans->journal_pin)
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
@@ -1003,7 +1033,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
struct bch_fs *c = trans->c;
struct btree_insert_entry *i = NULL;
struct btree_write_buffered_key *wb;
- unsigned u64s;
int ret = 0;
if (!trans->nr_updates &&
@@ -1063,13 +1092,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
- memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
-
trans->journal_u64s = trans->extra_journal_entries.nr;
- trans->journal_preres_u64s = 0;
-
trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
-
if (trans->journal_transaction_names)
trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
@@ -1085,16 +1109,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (i->key_cache_already_flushed)
continue;
- /* we're going to journal the key being updated: */
- u64s = jset_u64s(i->k->k.u64s);
- if (i->cached &&
- likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY)))
- trans->journal_preres_u64s += u64s;
-
if (i->flags & BTREE_UPDATE_NOJOURNAL)
continue;
- trans->journal_u64s += u64s;
+ /* we're going to journal the key being updated: */
+ trans->journal_u64s += jset_u64s(i->k->k.u64s);
/* and we're also going to log the overwrite: */
if (trans->journal_transaction_names)
@@ -1126,8 +1145,6 @@ retry:
trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
- bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-
if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 941841a0c5bf..60453ba86c4b 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -5,7 +5,7 @@
#include <linux/list.h>
#include <linux/rhashtable.h>
-//#include "bkey_methods.h"
+#include "btree_key_cache_types.h"
#include "buckets_types.h"
#include "darray.h"
#include "errcode.h"
@@ -312,31 +312,6 @@ struct btree_iter {
#endif
};
-struct btree_key_cache_freelist {
- struct bkey_cached *objs[16];
- unsigned nr;
-};
-
-struct btree_key_cache {
- struct mutex lock;
- struct rhashtable table;
- bool table_init_done;
- struct list_head freed_pcpu;
- struct list_head freed_nonpcpu;
- struct shrinker *shrink;
- unsigned shrink_iter;
- struct btree_key_cache_freelist __percpu *pcpu_freed;
-
- atomic_long_t nr_freed;
- atomic_long_t nr_keys;
- atomic_long_t nr_dirty;
-};
-
-struct bkey_cached_key {
- u32 btree_id;
- struct bpos pos;
-} __packed __aligned(4);
-
#define BKEY_CACHED_ACCESSED 0
#define BKEY_CACHED_DIRTY 1
@@ -352,7 +327,6 @@ struct bkey_cached {
struct rhash_head hash;
struct list_head list;
- struct journal_preres res;
struct journal_entry_pin journal;
u64 seq;
@@ -389,11 +363,7 @@ struct btree_insert_entry {
unsigned long ip_allocated;
};
-#ifndef CONFIG_LOCKDEP
#define BTREE_ITER_MAX 64
-#else
-#define BTREE_ITER_MAX 32
-#endif
struct btree_trans_commit_hook;
typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
@@ -434,6 +404,7 @@ struct btree_trans {
bool journal_transaction_names:1;
bool journal_replay_not_finished:1;
bool notrace_relock_fail:1;
+ bool write_locked:1;
enum bch_errcode restarted:16;
u32 restart_count;
unsigned long last_begin_ip;
@@ -465,11 +436,9 @@ struct btree_trans {
struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
- struct journal_preres journal_preres;
u64 *journal_seq;
struct disk_reservation *disk_res;
unsigned journal_u64s;
- unsigned journal_preres_u64s;
struct replicas_delta_list *fs_usage_deltas;
};
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 39c2db68123b..76f27bc9fa24 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -513,8 +513,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
up_read(&c->gc_lock);
as->took_gc_lock = false;
- bch2_journal_preres_put(&c->journal, &as->journal_preres);
-
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
bch2_disk_reservation_put(c, &as->disk_res);
@@ -734,8 +732,6 @@ err:
bch2_journal_pin_drop(&c->journal, &as->journal);
- bch2_journal_preres_put(&c->journal, &as->journal_preres);
-
mutex_lock(&c->btree_interior_update_lock);
for (i = 0; i < as->nr_new_nodes; i++) {
b = as->new_nodes[i];
@@ -1047,7 +1043,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
unsigned nr_nodes[2] = { 0, 0 };
unsigned update_level = level;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- unsigned journal_flags = 0;
int ret = 0;
u32 restart_count = trans->restart_count;
@@ -1061,10 +1056,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
- if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
- journal_flags |= JOURNAL_RES_GET_NONBLOCK;
- journal_flags |= watermark;
-
while (1) {
nr_nodes[!!update_level] += 1 + split;
update_level++;
@@ -1129,27 +1120,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (ret)
goto err;
- ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
- BTREE_UPDATE_JOURNAL_RES,
- journal_flags|JOURNAL_RES_GET_NONBLOCK);
- if (ret) {
- if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
- goto err;
- }
-
- ret = drop_locks_do(trans,
- bch2_journal_preres_get(&c->journal, &as->journal_preres,
- BTREE_UPDATE_JOURNAL_RES,
- journal_flags));
- if (ret == -BCH_ERR_journal_preres_get_blocked) {
- trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
- }
- if (ret)
- goto err;
- }
-
ret = bch2_disk_reservation_get(c, &as->disk_res,
(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
c->opts.metadata_replicas,
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 4df21512d640..031076e75fa1 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -55,7 +55,6 @@ struct btree_update {
unsigned update_level;
struct disk_reservation disk_res;
- struct journal_preres journal_preres;
/*
* BTREE_INTERIOR_UPDATING_NODE:
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 0771a6d880bf..5ed66202c226 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -239,6 +239,34 @@ restart_drop_extra_replicas:
next_pos = insert->k.p;
+ /*
+ * Check for nonce offset inconsistency:
+ * This is debug code - we've been seeing this bug rarely, and
+ * it's been hard to reproduce, so this should give us some more
+ * information when it does occur:
+ */
+ struct printbuf err = PRINTBUF;
+ int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
+ printbuf_exit(&err);
+
+ if (invalid) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "about to insert invalid key in data update path");
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+
+ bch2_fatal_error(c);
+ goto out;
+ }
+
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index d613695abf9f..4d0cb0ccff32 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -555,6 +555,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
case TARGET_DEV: {
struct bch_dev *ca;
+ out->atomic++;
rcu_read_lock();
ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
@@ -570,6 +571,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
}
rcu_read_unlock();
+ out->atomic--;
break;
}
case TARGET_GROUP:
@@ -580,7 +582,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
}
}
-void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
+static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
struct target t = target_decode(v);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 875f7c5a6fca..2a77de18c004 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1373,6 +1373,15 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
h->nr_active_devs++;
rcu_read_unlock();
+
+ /*
+ * If we only have redundancy + 1 devices, we're better off with just
+ * replication:
+ */
+ if (h->nr_active_devs < h->redundancy + 2)
+ bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
+ h->nr_active_devs, h->redundancy + 2);
+
list_add(&h->list, &c->ec_stripe_head_list);
return h;
}
@@ -1424,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
found:
+ if (!IS_ERR_OR_NULL(h) &&
+ h->nr_active_devs < h->redundancy + 2) {
+ mutex_unlock(&h->lock);
+ h = NULL;
+ }
mutex_unlock(&c->ec_stripe_head_lock);
return h;
}
@@ -1681,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
int ret;
h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
- if (!h)
- bch_err(c, "no stripe head");
if (IS_ERR_OR_NULL(h))
return h;
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index 8bd9bcdd27f7..ff664fd0d8ef 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -13,7 +13,7 @@
int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
loff_t start, u64 end,
- int fgp_flags, gfp_t gfp,
+ fgf_t fgp_flags, gfp_t gfp,
folios *fs)
{
struct folio *f;
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index a2222ad586e9..27f712ae37a6 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -7,7 +7,7 @@
typedef DARRAY(struct folio *) folios;
int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
- u64, int, gfp_t, folios *);
+ u64, fgf_t, gfp_t, folios *);
int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
/*
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 166d8d8abe68..8ef817304e4a 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1922,10 +1922,7 @@ out:
return dget(sb->s_root);
err_put_super:
- sb->s_fs_info = NULL;
- c->vfs_sb = NULL;
deactivate_locked_super(sb);
- bch2_fs_stop(c);
return ERR_PTR(bch2_err_class(ret));
}
@@ -1933,11 +1930,8 @@ static void bch2_kill_sb(struct super_block *sb)
{
struct bch_fs *c = sb->s_fs_info;
- if (c)
- c->vfs_sb = NULL;
generic_shutdown_super(sb);
- if (c)
- bch2_fs_free(c);
+ bch2_fs_free(c);
}
static struct file_system_type bcache_fs_type = {
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 9f3e9bd3d767..e0c5cd119acc 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -2220,7 +2220,7 @@ static int nlink_cmp(const void *_l, const void *_r)
const struct nlink *l = _l;
const struct nlink *r = _r;
- return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
+ return cmp_int(l->inum, r->inum);
}
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index def77f2d8802..c7849b0753e7 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -1134,7 +1134,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
* unlinked inodes in the snapshot leaves:
*/
*need_another_pass = true;
- return 0;
+ goto out;
}
ret = 1;
@@ -1169,8 +1169,10 @@ again:
*/
for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- ret = lockrestart_do(trans, may_delete_deleted_inode(trans, &iter, k.k->p,
- &need_another_pass));
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass));
if (ret < 0)
break;
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index f02b3f7d26a0..d704a8f829c8 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -795,7 +795,7 @@ static int bch2_write_decrypt(struct bch_write_op *op)
* checksum:
*/
csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- if (bch2_crc_cmp(op->crc.csum, csum))
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
return -EIO;
ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 5b5d69f2316b..23a9b7845d11 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -526,36 +526,6 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
return ret;
}
-/* journal_preres: */
-
-static bool journal_preres_available(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags)
-{
- bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
-
- if (!ret && mutex_trylock(&j->reclaim_lock)) {
- bch2_journal_reclaim(j);
- mutex_unlock(&j->reclaim_lock);
- }
-
- return ret;
-}
-
-int __bch2_journal_preres_get(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags)
-{
- int ret;
-
- closure_wait_event(&j->preres_wait,
- (ret = bch2_journal_error(j)) ||
- journal_preres_available(j, res, new_u64s, flags));
- return ret;
-}
-
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *j,
@@ -1306,7 +1276,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining);
prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 011711e99c8d..c85d01cf4948 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -395,104 +395,6 @@ out:
return 0;
}
-/* journal_preres: */
-
-static inline void journal_set_watermark(struct journal *j)
-{
- union journal_preres_state s = READ_ONCE(j->prereserved);
- unsigned watermark = BCH_WATERMARK_stripe;
-
- if (fifo_free(&j->pin) < j->pin.size / 4)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
- if (fifo_free(&j->pin) < j->pin.size / 8)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
- if (s.reserved > s.remaining)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
- if (!s.remaining)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
- if (watermark == j->watermark)
- return;
-
- swap(watermark, j->watermark);
- if (watermark > j->watermark)
- journal_wake(j);
-}
-
-static inline void bch2_journal_preres_put(struct journal *j,
- struct journal_preres *res)
-{
- union journal_preres_state s = { .reserved = res->u64s };
-
- if (!res->u64s)
- return;
-
- s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
- res->u64s = 0;
-
- if (unlikely(s.waiting)) {
- clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
- (unsigned long *) &j->prereserved.v);
- closure_wake_up(&j->preres_wait);
- }
-
- if (s.reserved <= s.remaining && j->watermark)
- journal_set_watermark(j);
-}
-
-int __bch2_journal_preres_get(struct journal *,
- struct journal_preres *, unsigned, unsigned);
-
-static inline int bch2_journal_preres_get_fast(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags,
- bool set_waiting)
-{
- int d = new_u64s - res->u64s;
- union journal_preres_state old, new;
- u64 v = atomic64_read(&j->prereserved.counter);
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- int ret;
-
- do {
- old.v = new.v = v;
- ret = 0;
-
- if (watermark == BCH_WATERMARK_reclaim ||
- new.reserved + d < new.remaining) {
- new.reserved += d;
- ret = 1;
- } else if (set_waiting && !new.waiting)
- new.waiting = true;
- else
- return 0;
- } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
- old.v, new.v)) != old.v);
-
- if (ret)
- res->u64s += d;
- return ret;
-}
-
-static inline int bch2_journal_preres_get(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags)
-{
- if (new_u64s <= res->u64s)
- return 0;
-
- if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
- return 0;
-
- if (flags & JOURNAL_RES_GET_NONBLOCK)
- return -BCH_ERR_journal_preres_get_blocked;
-
- return __bch2_journal_preres_get(j, res, new_u64s, flags);
-}
-
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *,
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index f4bc2cdbfdd7..786a09285509 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1079,6 +1079,12 @@ found:
if (ja->bucket_seq[ja->cur_idx] &&
ja->sectors_free == ca->mi.bucket_size) {
+#if 0
+ /*
+ * Debug code for ZNS support, where we (probably) want to be
+ * correlated where we stopped in the journal to the zone write
+ * points:
+ */
bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
for (i = 0; i < 3; i++) {
@@ -1086,6 +1092,7 @@ found:
bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
}
+#endif
ja->sectors_free = 0;
}
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 9a584aaaa2eb..e63c6eda86af 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -50,16 +50,21 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
return available;
}
-static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
+static inline void journal_set_watermark(struct journal *j, bool low_on_space)
{
- union journal_preres_state old, new;
- u64 v = atomic64_read(&j->prereserved.counter);
+ unsigned watermark = BCH_WATERMARK_stripe;
- do {
- old.v = new.v = v;
- new.remaining = u64s_remaining;
- } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
- old.v, new.v)) != old.v);
+ if (low_on_space)
+ watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
+ if (fifo_free(&j->pin) < j->pin.size / 4)
+ watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
+
+ if (watermark == j->watermark)
+ return;
+
+ swap(watermark, j->watermark);
+ if (watermark > j->watermark)
+ journal_wake(j);
}
static struct journal_space
@@ -162,7 +167,6 @@ void bch2_journal_space_available(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned clean, clean_ondisk, total;
- s64 u64s_remaining = 0;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs_want;
@@ -222,16 +226,10 @@ void bch2_journal_space_available(struct journal *j)
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
- u64s_remaining = (u64) clean << 6;
- u64s_remaining -= (u64) total << 3;
- u64s_remaining = max(0LL, u64s_remaining);
- u64s_remaining /= 4;
- u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
+ journal_set_watermark(j, clean * 4 <= total);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
- journal_set_remaining(j, u64s_remaining);
- journal_set_watermark(j);
if (!ret)
journal_wake(j);
@@ -555,11 +553,6 @@ static u64 journal_seq_to_flush(struct journal *j)
/* Try to keep the journal at most half full: */
nr_buckets = ja->nr / 2;
- /* And include pre-reservations: */
- nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
- (ca->mi.bucket_size << 6) -
- journal_entry_overhead(j));
-
nr_buckets = min(nr_buckets, ja->nr);
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
@@ -638,10 +631,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
msecs_to_jiffies(c->opts.journal_reclaim_delay)))
min_nr = 1;
- if (j->prereserved.reserved * 4 > j->prereserved.remaining)
- min_nr = 1;
-
- if (fifo_free(&j->pin) <= 32)
+ if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
@@ -652,8 +642,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
- j->prereserved.reserved,
- j->prereserved.remaining,
atomic_read(&c->btree_cache.dirty),
c->btree_cache.used,
atomic_long_read(&c->btree_key_cache.nr_dirty),
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 42504e16acb6..a756b69582e3 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -76,14 +76,6 @@ struct journal_res {
u64 seq;
};
-/*
- * For reserving space in the journal prior to getting a reservation on a
- * particular journal entry:
- */
-struct journal_preres {
- unsigned u64s;
-};
-
union journal_res_state {
struct {
atomic64_t counter;
@@ -104,22 +96,6 @@ union journal_res_state {
};
};
-union journal_preres_state {
- struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
- struct {
- u64 waiting:1,
- reserved:31,
- remaining:32;
- };
-};
-
/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
@@ -180,8 +156,6 @@ struct journal {
union journal_res_state reservations;
enum bch_watermark watermark;
- union journal_preres_state prereserved;
-
} __aligned(SMP_CACHE_BYTES);
unsigned long flags;
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index b775cf0fb7cb..97790445e67a 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -163,8 +163,11 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
this_cpu_sub(*lock->readers, !ret);
preempt_enable();
- if (!ret && (old & SIX_LOCK_WAITING_write))
- ret = -1 - SIX_LOCK_write;
+ if (!ret) {
+ smp_mb();
+ if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write)
+ ret = -1 - SIX_LOCK_write;
+ }
} else if (type == SIX_LOCK_write && lock->readers) {
if (try) {
atomic_add(SIX_LOCK_HELD_write, &lock->state);
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index 86833445af20..2d2e66a4e468 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -20,7 +20,7 @@ struct snapshot_t {
};
struct snapshot_table {
- struct snapshot_t s[0];
+ DECLARE_FLEX_ARRAY(struct snapshot_t, s);
};
typedef struct {
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 893304a1f06e..7857671159b4 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -196,10 +196,9 @@ DEFINE_EVENT(bio, journal_write,
TRACE_EVENT(journal_reclaim_start,
TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
u64 min_nr, u64 min_key_cache,
- u64 prereserved, u64 prereserved_total,
u64 btree_cache_dirty, u64 btree_cache_total,
u64 btree_key_cache_dirty, u64 btree_key_cache_total),
- TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total,
+ TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
btree_cache_dirty, btree_cache_total,
btree_key_cache_dirty, btree_key_cache_total),
@@ -209,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start,
__field(bool, kicked )
__field(u64, min_nr )
__field(u64, min_key_cache )
- __field(u64, prereserved )
- __field(u64, prereserved_total )
__field(u64, btree_cache_dirty )
__field(u64, btree_cache_total )
__field(u64, btree_key_cache_dirty )
@@ -223,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start,
__entry->kicked = kicked;
__entry->min_nr = min_nr;
__entry->min_key_cache = min_key_cache;
- __entry->prereserved = prereserved;
- __entry->prereserved_total = prereserved_total;
__entry->btree_cache_dirty = btree_cache_dirty;
__entry->btree_cache_total = btree_cache_total;
__entry->btree_key_cache_dirty = btree_key_cache_dirty;
__entry->btree_key_cache_total = btree_key_cache_total;
),
- TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+ TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->direct,
__entry->kicked,
__entry->min_nr,
__entry->min_key_cache,
- __entry->prereserved,
- __entry->prereserved_total,
__entry->btree_cache_dirty,
__entry->btree_cache_total,
__entry->btree_key_cache_dirty,
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index a39ff0c296ec..79d982674c18 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -552,6 +552,14 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
s.v = v + 1;
s.defined = true;
} else {
+ /*
+ * Check if this option was set on the parent - if so, switched
+ * back to inheriting from the parent:
+ *
+ * rename() also has to deal with keeping inherited options up
+ * to date - see bch2_reinherit_attrs()
+ */
+ spin_lock(&dentry->d_lock);
if (!IS_ROOT(dentry)) {
struct bch_inode_info *dir =
to_bch_ei(d_inode(dentry->d_parent));
@@ -560,6 +568,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
} else {
s.v = 0;
}
+ spin_unlock(&dentry->d_lock);
s.defined = false;
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2a9344a3fcee..35c1d24d4a78 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -432,7 +432,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (btrfs_block_can_be_shared(trans, root, buf)) {
ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
btrfs_header_level(buf), 1,
- &refs, &flags);
+ &refs, &flags, NULL);
if (ret)
return ret;
if (unlikely(refs == 0)) {
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 9223934d95f4..891ea2fa263c 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -1041,7 +1041,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) {
+ if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
@@ -1144,7 +1144,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) {
+ if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c8e5b4715b49..0455935ff558 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -102,7 +102,8 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
*/
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 offset, int metadata, u64 *refs, u64 *flags)
+ u64 offset, int metadata, u64 *refs, u64 *flags,
+ u64 *owning_root)
{
struct btrfs_root *extent_root;
struct btrfs_delayed_ref_head *head;
@@ -114,6 +115,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
u32 item_size;
u64 num_refs;
u64 extent_flags;
+ u64 owner = 0;
int ret;
/*
@@ -167,6 +169,8 @@ search_again:
struct btrfs_extent_item);
num_refs = btrfs_extent_refs(leaf, ei);
extent_flags = btrfs_extent_flags(leaf, ei);
+ owner = btrfs_get_extent_owner_root(fs_info, leaf,
+ path->slots[0]);
} else {
ret = -EUCLEAN;
btrfs_err(fs_info,
@@ -226,6 +230,8 @@ out:
*refs = num_refs;
if (flags)
*flags = extent_flags;
+ if (owning_root)
+ *owning_root = owner;
out_free:
btrfs_free_path(path);
return ret;
@@ -5234,7 +5240,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
/* We don't lock the tree block, it's OK to be racy here */
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
wc->level - 1, 1, &refs,
- &flags);
+ &flags, NULL);
/* We don't care about errors in readahead. */
if (ret < 0)
continue;
@@ -5301,7 +5307,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level],
+ NULL);
BUG_ON(ret == -ENOMEM);
if (ret)
return ret;
@@ -5391,6 +5398,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
u64 bytenr;
u64 generation;
u64 parent;
+ u64 owner_root = 0;
struct btrfs_tree_parent_check check = { 0 };
struct btrfs_key key;
struct btrfs_ref ref = { 0 };
@@ -5434,7 +5442,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
- &wc->flags[level - 1]);
+ &wc->flags[level - 1],
+ &owner_root);
if (ret < 0)
goto out_unlock;
@@ -5567,8 +5576,7 @@ skip:
find_next_key(path, level, &wc->drop_progress);
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- fs_info->nodesize, parent,
- btrfs_header_owner(next));
+ fs_info->nodesize, parent, owner_root);
btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
0, false);
ret = btrfs_free_extent(trans, &ref);
@@ -5635,7 +5643,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level],
+ NULL);
if (ret < 0) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
@@ -5880,7 +5889,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
level, 1, &wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level], NULL);
if (ret < 0) {
err = ret;
goto out_end_trans;
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 0716f65d9753..2e066035ccee 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -99,7 +99,8 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 offset, int metadata, u64 *refs, u64 *flags);
+ u64 offset, int metadata, u64 *refs, u64 *flags,
+ u64 *owner_root);
int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5e3fccddde0c..9f5a9894f88f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6983,8 +6983,15 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
int ret;
alloc_hint = get_extent_allocation_hint(inode, start, len);
+again:
ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
0, alloc_hint, &ins, 1, 1);
+ if (ret == -EAGAIN) {
+ ASSERT(btrfs_is_zoned(fs_info));
+ wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
+ goto again;
+ }
if (ret)
return ERR_PTR(ret);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 752acff2c734..dfe257e1845b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1528,7 +1528,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
static noinline int copy_to_sk(struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
- size_t *buf_size,
+ u64 *buf_size,
char __user *ubuf,
unsigned long *sk_offset,
int *num_found)
@@ -1660,7 +1660,7 @@ out:
static noinline int search_ioctl(struct inode *inode,
struct btrfs_ioctl_search_key *sk,
- size_t *buf_size,
+ u64 *buf_size,
char __user *ubuf)
{
struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
@@ -1733,7 +1733,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
struct btrfs_ioctl_search_args __user *uargs = argp;
struct btrfs_ioctl_search_key sk;
int ret;
- size_t buf_size;
+ u64 buf_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1763,8 +1763,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
struct btrfs_ioctl_search_args_v2 args;
int ret;
- size_t buf_size;
- const size_t buf_limit = SZ_16M;
+ u64 buf_size;
+ const u64 buf_limit = SZ_16M;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index edb84cc03237..ce446d9d7f23 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1888,7 +1888,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
u64 bytenr = record->bytenr;
if (!btrfs_qgroup_full_accounting(fs_info))
- return 0;
+ return 1;
lockdep_assert_held(&delayed_refs->lock);
trace_btrfs_qgroup_trace_extent(fs_info, record);
@@ -2875,12 +2875,18 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
num_bytes, seq);
/*
+ * We're done using the iterator, release all its qgroups while holding
+ * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup()
+ * and trigger use-after-free accesses to qgroups.
+ */
+ qgroup_iterator_nested_clean(&qgroups);
+
+ /*
* Bump qgroup_seq to avoid seq overlap
*/
fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
spin_unlock(&fs_info->qgroup_lock);
out_free:
- qgroup_iterator_nested_clean(&qgroups);
ulist_free(old_roots);
ulist_free(new_roots);
return ret;
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 944e8f1862aa..9589362acfbf 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -145,7 +145,7 @@ int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
btrfs_put_bioc(bioc);
}
- return ret;
+ return 0;
}
int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 9ce5be21b036..f62a408671cb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1868,6 +1868,9 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
*/
ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);
+ /* @found_logical_ret must be specified. */
+ ASSERT(found_logical_ret);
+
stripe = &sctx->stripes[sctx->cur_stripe];
scrub_reset_stripe(stripe);
ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
@@ -1876,8 +1879,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
/* Either >0 as no more extents or <0 for error. */
if (ret)
return ret;
- if (found_logical_ret)
- *found_logical_ret = stripe->logical;
+ *found_logical_ret = stripe->logical;
sctx->cur_stripe++;
/* We filled one group, submit it. */
@@ -2080,7 +2082,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
/* Go through each extent items inside the logical range */
while (cur_logical < logical_end) {
- u64 found_logical;
+ u64 found_logical = U64_MAX;
u64 cur_physical = physical + cur_logical - logical_start;
/* Canceled? */
@@ -2115,6 +2117,8 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
if (ret < 0)
break;
+ /* queue_scrub_stripe() returned 0, @found_logical must be updated. */
+ ASSERT(found_logical != U64_MAX);
cur_logical = found_logical + BTRFS_STRIPE_LEN;
/* Don't hold CPU for too long time */
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c87e18827a0a..c6f16625af51 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -748,13 +748,13 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (!fs_devices) {
fs_devices = alloc_fs_devices(disk_super->fsid);
+ if (IS_ERR(fs_devices))
+ return ERR_CAST(fs_devices);
+
if (has_metadata_uuid)
memcpy(fs_devices->metadata_uuid,
disk_super->metadata_uuid, BTRFS_FSID_SIZE);
- if (IS_ERR(fs_devices))
- return ERR_CAST(fs_devices);
-
if (same_fsid_diff_dev) {
generate_random_uuid(fs_devices->fsid);
fs_devices->temp_fsid = true;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 3504ade30cb0..188378ca19c7 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1661,13 +1661,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
out:
- if (cache->alloc_offset > fs_info->zone_size) {
- btrfs_err(fs_info,
- "zoned: invalid write pointer %llu in block group %llu",
- cache->alloc_offset, cache->start);
- ret = -EIO;
- }
-
if (cache->alloc_offset > cache->zone_capacity) {
btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 929248c6ca84..4cbe0434cbb8 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -84,8 +84,8 @@ int nfsd_net_reply_cache_init(struct nfsd_net *nn);
void nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
int nfsd_reply_cache_init(struct nfsd_net *);
void nfsd_reply_cache_shutdown(struct nfsd_net *);
-int nfsd_cache_lookup(struct svc_rqst *rqstp,
- struct nfsd_cacherep **cacherep);
+int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
+ unsigned int len, struct nfsd_cacherep **cacherep);
void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp,
int cachetype, __be32 *statp);
int nfsd_reply_cache_stats_show(struct seq_file *m, void *v);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4045c852a450..40415929e2ae 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2804,7 +2804,7 @@ static int client_opens_release(struct inode *inode, struct file *file)
/* XXX: alternatively, we could get/drop in seq start/stop */
drop_client(clp);
- return 0;
+ return seq_release(inode, file);
}
static const struct file_operations client_states_fops = {
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index fd56a52aa5fb..d3273a396659 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -369,33 +369,52 @@ nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
return freed;
}
-/*
- * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
+/**
+ * nfsd_cache_csum - Checksum incoming NFS Call arguments
+ * @buf: buffer containing a whole RPC Call message
+ * @start: starting byte of the NFS Call header
+ * @remaining: size of the NFS Call header, in bytes
+ *
+ * Compute a weak checksum of the leading bytes of an NFS procedure
+ * call header to help verify that a retransmitted Call matches an
+ * entry in the duplicate reply cache.
+ *
+ * To avoid assumptions about how the RPC message is laid out in
+ * @buf and what else it might contain (eg, a GSS MIC suffix), the
+ * caller passes us the exact location and length of the NFS Call
+ * header.
+ *
+ * Returns a 32-bit checksum value, as defined in RFC 793.
*/
-static __wsum
-nfsd_cache_csum(struct svc_rqst *rqstp)
+static __wsum nfsd_cache_csum(struct xdr_buf *buf, unsigned int start,
+ unsigned int remaining)
{
+ unsigned int base, len;
+ struct xdr_buf subbuf;
+ __wsum csum = 0;
+ void *p;
int idx;
- unsigned int base;
- __wsum csum;
- struct xdr_buf *buf = &rqstp->rq_arg;
- const unsigned char *p = buf->head[0].iov_base;
- size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len,
- RC_CSUMLEN);
- size_t len = min(buf->head[0].iov_len, csum_len);
+
+ if (remaining > RC_CSUMLEN)
+ remaining = RC_CSUMLEN;
+ if (xdr_buf_subsegment(buf, &subbuf, start, remaining))
+ return csum;
/* rq_arg.head first */
- csum = csum_partial(p, len, 0);
- csum_len -= len;
+ if (subbuf.head[0].iov_len) {
+ len = min_t(unsigned int, subbuf.head[0].iov_len, remaining);
+ csum = csum_partial(subbuf.head[0].iov_base, len, csum);
+ remaining -= len;
+ }
/* Continue into page array */
- idx = buf->page_base / PAGE_SIZE;
- base = buf->page_base & ~PAGE_MASK;
- while (csum_len) {
- p = page_address(buf->pages[idx]) + base;
- len = min_t(size_t, PAGE_SIZE - base, csum_len);
+ idx = subbuf.page_base / PAGE_SIZE;
+ base = subbuf.page_base & ~PAGE_MASK;
+ while (remaining) {
+ p = page_address(subbuf.pages[idx]) + base;
+ len = min_t(unsigned int, PAGE_SIZE - base, remaining);
csum = csum_partial(p, len, csum);
- csum_len -= len;
+ remaining -= len;
base = 0;
++idx;
}
@@ -466,6 +485,8 @@ out:
/**
* nfsd_cache_lookup - Find an entry in the duplicate reply cache
* @rqstp: Incoming Call to find
+ * @start: starting byte in @rqstp->rq_arg of the NFS Call header
+ * @len: size of the NFS Call header, in bytes
* @cacherep: OUT: DRC entry for this request
*
* Try to find an entry matching the current call in the cache. When none
@@ -479,7 +500,8 @@ out:
* %RC_REPLY: Reply from cache
* %RC_DROPIT: Do not process the request further
*/
-int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep)
+int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
+ unsigned int len, struct nfsd_cacherep **cacherep)
{
struct nfsd_net *nn;
struct nfsd_cacherep *rp, *found;
@@ -495,7 +517,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep)
goto out;
}
- csum = nfsd_cache_csum(rqstp);
+ csum = nfsd_cache_csum(&rqstp->rq_arg, start, len);
/*
* Since the common case is a cache miss followed by an insert,
@@ -641,24 +663,17 @@ void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp,
return;
}
-/*
- * Copy cached reply to current reply buffer. Should always fit.
- * FIXME as reply is in a page, we should just attach the page, and
- * keep a refcount....
- */
static int
nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
{
- struct kvec *vec = &rqstp->rq_res.head[0];
-
- if (vec->iov_len + data->iov_len > PAGE_SIZE) {
- printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n",
- data->iov_len);
- return 0;
- }
- memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len);
- vec->iov_len += data->iov_len;
- return 1;
+ __be32 *p;
+
+ p = xdr_reserve_space(&rqstp->rq_res_stream, data->iov_len);
+ if (unlikely(!p))
+ return false;
+ memcpy(p, data->iov_base, data->iov_len);
+ xdr_commit_encode(&rqstp->rq_res_stream);
+ return true;
}
/*
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index d6122bb2d167..fe61d9bbcc1f 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -981,6 +981,8 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
const struct svc_procedure *proc = rqstp->rq_procinfo;
__be32 *statp = rqstp->rq_accept_statp;
struct nfsd_cacherep *rp;
+ unsigned int start, len;
+ __be32 *nfs_reply;
/*
* Give the xdr decoder a chance to change this if it wants
@@ -988,6 +990,13 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
*/
rqstp->rq_cachetype = proc->pc_cachetype;
+ /*
+ * ->pc_decode advances the argument stream past the NFS
+ * Call header, so grab the header's starting location and
+ * size now for the call to nfsd_cache_lookup().
+ */
+ start = xdr_stream_pos(&rqstp->rq_arg_stream);
+ len = xdr_stream_remaining(&rqstp->rq_arg_stream);
if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
goto out_decode_err;
@@ -1001,7 +1010,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1);
rp = NULL;
- switch (nfsd_cache_lookup(rqstp, &rp)) {
+ switch (nfsd_cache_lookup(rqstp, start, len, &rp)) {
case RC_DOIT:
break;
case RC_REPLY:
@@ -1010,6 +1019,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
goto out_dropit;
}
+ nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0);
*statp = proc->pc_func(rqstp);
if (test_bit(RQ_DROPME, &rqstp->rq_flags))
goto out_update_drop;
@@ -1023,7 +1033,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
*/
smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1);
- nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
+ nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply);
out_cached_reply:
return 1;
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index ddab9ea267d1..3fe2dde1598f 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -430,7 +430,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
struct ovl_fs_context *ctx = fc->fs_private;
struct ovl_fs_context_layer *l;
char *dup = NULL, *iter;
- ssize_t nr_lower = 0, nr = 0, nr_data = 0;
+ ssize_t nr_lower, nr;
bool data_layer = false;
/*
@@ -482,6 +482,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
iter = dup;
l = ctx->lower;
for (nr = 0; nr < nr_lower; nr++, l++) {
+ ctx->nr++;
memset(l, 0, sizeof(*l));
err = ovl_mount_dir(iter, &l->path);
@@ -498,10 +499,10 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
goto out_put;
if (data_layer)
- nr_data++;
+ ctx->nr_data++;
/* Calling strchr() again would overrun. */
- if ((nr + 1) == nr_lower)
+ if (ctx->nr == nr_lower)
break;
err = -EINVAL;
@@ -511,7 +512,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
* This is a regular layer so we require that
* there are no data layers.
*/
- if ((ctx->nr_data + nr_data) > 0) {
+ if (ctx->nr_data > 0) {
pr_err("regular lower layers cannot follow data lower layers");
goto out_put;
}
@@ -524,8 +525,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
data_layer = true;
iter++;
}
- ctx->nr = nr_lower;
- ctx->nr_data += nr_data;
kfree(dup);
return 0;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 50a201e9cd39..c3f020ca13a8 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -978,7 +978,7 @@ int ovl_set_protattr(struct inode *inode, struct dentry *upper,
return 0;
}
-/**
+/*
* Caller must hold a reference to inode to prevent it from being freed while
* it is marked inuse.
*/
diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c
index 6f3285f1dfee..af7849e5974f 100644
--- a/fs/smb/client/cifs_spnego.c
+++ b/fs/smb/client/cifs_spnego.c
@@ -64,8 +64,8 @@ struct key_type cifs_spnego_key_type = {
* strlen(";sec=ntlmsspi") */
#define MAX_MECH_STR_LEN 13
-/* strlen of "host=" */
-#define HOST_KEY_LEN 5
+/* strlen of ";host=" */
+#define HOST_KEY_LEN 6
/* strlen of ";ip4=" or ";ip6=" */
#define IP_KEY_LEN 5
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 57c2a7df3457..f896f60c924b 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -2065,6 +2065,12 @@ void __cifs_put_smb_ses(struct cifs_ses *ses)
ses->chans[i].server = NULL;
}
+ /* we now account for primary channel in iface->refcount */
+ if (ses->chans[0].iface) {
+ kref_put(&ses->chans[0].iface->refcount, release_iface);
+ ses->chans[0].server = NULL;
+ }
+
sesInfoFree(ses);
cifs_put_tcp_session(server, 0);
}
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 0bb2ac929061..8b2d7c1ca428 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -322,28 +322,32 @@ cifs_disable_secondary_channels(struct cifs_ses *ses)
iface = ses->chans[i].iface;
server = ses->chans[i].server;
+ /*
+ * remove these references first, since we need to unlock
+ * the chan_lock here, since iface_lock is a higher lock
+ */
+ ses->chans[i].iface = NULL;
+ ses->chans[i].server = NULL;
+ spin_unlock(&ses->chan_lock);
+
if (iface) {
spin_lock(&ses->iface_lock);
kref_put(&iface->refcount, release_iface);
- ses->chans[i].iface = NULL;
iface->num_channels--;
if (iface->weight_fulfilled)
iface->weight_fulfilled--;
spin_unlock(&ses->iface_lock);
}
- spin_unlock(&ses->chan_lock);
- if (server && !server->terminate) {
- server->terminate = true;
- cifs_signal_cifsd_for_reconnect(server, false);
- }
- spin_lock(&ses->chan_lock);
-
if (server) {
- ses->chans[i].server = NULL;
+ if (!server->terminate) {
+ server->terminate = true;
+ cifs_signal_cifsd_for_reconnect(server, false);
+ }
cifs_put_tcp_session(server, false);
}
+ spin_lock(&ses->chan_lock);
}
done:
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 84ea67301303..5a3ca62d2f07 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -458,6 +458,8 @@ generate_smb3signingkey(struct cifs_ses *ses,
ptriplet->encryption.context,
ses->smb3encryptionkey,
SMB3_ENC_DEC_KEY_SIZE);
+ if (rc)
+ return rc;
rc = generate_key(ses, ptriplet->decryption.label,
ptriplet->decryption.context,
ses->smb3decryptionkey,
@@ -466,9 +468,6 @@ generate_smb3signingkey(struct cifs_ses *ses,
return rc;
}
- if (rc)
- return rc;
-
#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS
cifs_dbg(VFS, "%s: dumping generated AES session keys\n", __func__);
/*
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index ed0bc8cbc703..567fb37274d3 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -147,7 +147,7 @@ config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
depends on XFS_ONLINE_SCRUB
- select XFS_DEBUG
+ select DEBUG_FS
help
If you say Y here, the kernel will gather usage data about
the online metadata check subsystem. This includes the number
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 3069194527dd..100ab5931b31 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2275,16 +2275,37 @@ xfs_alloc_min_freelist(
ASSERT(mp->m_alloc_maxlevels > 0);
+ /*
+ * For a btree shorter than the maximum height, the worst case is that
+ * every level gets split and a new level is added, then while inserting
+ * another entry to refill the AGFL, every level under the old root gets
+ * split again. This is:
+ *
+ * (full height split reservation) + (AGFL refill split height)
+ * = (current height + 1) + (current height - 1)
+ * = (new height) + (new height - 2)
+ * = 2 * new height - 2
+ *
+ * For a btree of maximum height, the worst case is that every level
+ * under the root gets split, then while inserting another entry to
+ * refill the AGFL, every level under the root gets split again. This is
+ * also:
+ *
+ * 2 * (current height - 1)
+ * = 2 * (new height - 1)
+ * = 2 * new height - 2
+ */
+
/* space needed by-bno freespace btree */
min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
- mp->m_alloc_maxlevels);
+ mp->m_alloc_maxlevels) * 2 - 2;
/* space needed by-size freespace btree */
min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
- mp->m_alloc_maxlevels);
+ mp->m_alloc_maxlevels) * 2 - 2;
/* space needed reverse mapping used space btree */
if (xfs_has_rmapbt(mp))
min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
- mp->m_rmap_maxlevels);
+ mp->m_rmap_maxlevels) * 2 - 2;
return min_free;
}
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index bcfb6a4203cd..f71679ce23b9 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -245,21 +245,18 @@ xfs_defer_create_intents(
return ret;
}
-/* Abort all the intents that were committed. */
STATIC void
-xfs_defer_trans_abort(
- struct xfs_trans *tp,
- struct list_head *dop_pending)
+xfs_defer_pending_abort(
+ struct xfs_mount *mp,
+ struct list_head *dop_list)
{
struct xfs_defer_pending *dfp;
const struct xfs_defer_op_type *ops;
- trace_xfs_defer_trans_abort(tp, _RET_IP_);
-
/* Abort intent items that don't have a done item. */
- list_for_each_entry(dfp, dop_pending, dfp_list) {
+ list_for_each_entry(dfp, dop_list, dfp_list) {
ops = defer_op_types[dfp->dfp_type];
- trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
+ trace_xfs_defer_pending_abort(mp, dfp);
if (dfp->dfp_intent && !dfp->dfp_done) {
ops->abort_intent(dfp->dfp_intent);
dfp->dfp_intent = NULL;
@@ -267,6 +264,16 @@ xfs_defer_trans_abort(
}
}
+/* Abort all the intents that were committed. */
+STATIC void
+xfs_defer_trans_abort(
+ struct xfs_trans *tp,
+ struct list_head *dop_pending)
+{
+ trace_xfs_defer_trans_abort(tp, _RET_IP_);
+ xfs_defer_pending_abort(tp->t_mountp, dop_pending);
+}
+
/*
* Capture resources that the caller said not to release ("held") when the
* transaction commits. Caller is responsible for zero-initializing @dres.
@@ -756,12 +763,13 @@ xfs_defer_ops_capture(
/* Release all resources that we used to capture deferred ops. */
void
-xfs_defer_ops_capture_free(
+xfs_defer_ops_capture_abort(
struct xfs_mount *mp,
struct xfs_defer_capture *dfc)
{
unsigned short i;
+ xfs_defer_pending_abort(mp, &dfc->dfc_dfops);
xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
@@ -802,7 +810,7 @@ xfs_defer_ops_capture_and_commit(
/* Commit the transaction and add the capture structure to the list. */
error = xfs_trans_commit(tp);
if (error) {
- xfs_defer_ops_capture_free(mp, dfc);
+ xfs_defer_ops_capture_abort(mp, dfc);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 114a3a4930a3..8788ad5f6a73 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -121,7 +121,7 @@ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
struct list_head *capture_list);
void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
struct xfs_defer_resources *dres);
-void xfs_defer_ops_capture_free(struct xfs_mount *mp,
+void xfs_defer_ops_capture_abort(struct xfs_mount *mp,
struct xfs_defer_capture *d);
void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 543f3748c2a3..137a65bda95d 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -510,6 +510,9 @@ xfs_dinode_verify(
if (mode && nextents + naextents > nblocks)
return __this_address;
+ if (nextents + naextents == 0 && nblocks != 0)
+ return __this_address;
+
if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
return __this_address;
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 0e5dba2343ea..144198a6b270 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -286,6 +286,7 @@ xlog_recover_inode_commit_pass2(
struct xfs_log_dinode *ldip;
uint isize;
int need_free = 0;
+ xfs_failaddr_t fa;
if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
in_f = item->ri_buf[0].i_addr;
@@ -369,24 +370,26 @@ xlog_recover_inode_commit_pass2(
* superblock flag to determine whether we need to look at di_flushiter
* to skip replay when the on disk inode is newer than the log one
*/
- if (!xfs_has_v3inodes(mp) &&
- ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
- /*
- * Deal with the wrap case, DI_MAX_FLUSH is less
- * than smaller numbers
- */
- if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
- ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
- /* do nothing */
- } else {
- trace_xfs_log_recover_inode_skip(log, in_f);
- error = 0;
- goto out_release;
+ if (!xfs_has_v3inodes(mp)) {
+ if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ /*
+ * Deal with the wrap case, DI_MAX_FLUSH is less
+ * than smaller numbers
+ */
+ if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
+ ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+ /* do nothing */
+ } else {
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto out_release;
+ }
}
+
+ /* Take the opportunity to reset the flush iteration count */
+ ldip->di_flushiter = 0;
}
- /* Take the opportunity to reset the flush iteration count */
- ldip->di_flushiter = 0;
if (unlikely(S_ISREG(ldip->di_mode))) {
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -528,8 +531,19 @@ out_owner_change:
(dip->di_mode != 0))
error = xfs_recover_inode_owner_change(mp, dip, in_f,
buffer_list);
- /* re-generate the checksum. */
+ /* re-generate the checksum and validate the recovered inode. */
xfs_dinode_calc_crc(log->l_mp, dip);
+ fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip);
+ if (fa) {
+ XFS_CORRUPTION_ERROR(
+ "Bad dinode after recovery",
+ XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip));
+ xfs_alert(mp,
+ "Metadata corruption detected at %pS, inode 0x%llx",
+ fa, in_f->ilf_ino);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
ASSERT(bp->b_mount == mp);
bp->b_flags |= _XBF_LOGRECOVERY;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 51c100c86177..ee206facf0dc 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1893,9 +1893,7 @@ xlog_write_iclog(
* the buffer manually, the code needs to be kept in sync
* with the I/O completion path.
*/
- xlog_state_done_syncing(iclog);
- up(&iclog->ic_sema);
- return;
+ goto sync;
}
/*
@@ -1925,20 +1923,17 @@ xlog_write_iclog(
* avoid shutdown re-entering this path and erroring out again.
*/
if (log->l_targ != log->l_mp->m_ddev_targp &&
- blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) {
- xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
- return;
- }
+ blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev))
+ goto shutdown;
}
if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
iclog->ic_bio.bi_opf |= REQ_FUA;
iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
- if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
- xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
- return;
- }
+ if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count))
+ goto shutdown;
+
if (is_vmalloc_addr(iclog->ic_data))
flush_kernel_vmap_range(iclog->ic_data, count);
@@ -1959,6 +1954,12 @@ xlog_write_iclog(
}
submit_bio(&iclog->ic_bio);
+ return;
+shutdown:
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+sync:
+ xlog_state_done_syncing(iclog);
+ up(&iclog->ic_sema);
}
/*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 13b94d2e605b..a1e18b24971a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2511,7 +2511,7 @@ xlog_abort_defer_ops(
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
list_del_init(&dfc->dfc_list);
- xfs_defer_ops_capture_free(mp, dfc);
+ xfs_defer_ops_capture_abort(mp, dfc);
}
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 658edee8381d..e5b62dc28466 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -784,6 +784,7 @@ xfs_reflink_end_cow_extent(
}
}
del = got;
+ xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb);
/* Grab the corresponding mapping in the data fork. */
nmaps = 1;