Diffstat (limited to 'fs')
92 files changed, 1109 insertions, 591 deletions
diff --git a/fs/9p/fid.h b/fs/9p/fid.h index 29281b7c3887..0d6138bee2a3 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -49,9 +49,6 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry) static inline void v9fs_fid_add_modes(struct p9_fid *fid, unsigned int s_flags, unsigned int s_cache, unsigned int f_flags) { - if (fid->qid.type != P9_QTFILE) - return; - if ((!s_cache) || ((fid->qid.version == 0) && !(s_flags & V9FS_IGNORE_QV)) || (s_flags & V9FS_DIRECT_IO) || (f_flags & O_DIRECT)) { diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index abdbbaee5184..348cc90bf9c5 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -520,6 +520,7 @@ const struct file_operations v9fs_file_operations = { .splice_read = v9fs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync, + .setlease = simple_nosetlease, }; const struct file_operations v9fs_file_operations_dotl = { @@ -534,4 +535,5 @@ const struct file_operations v9fs_file_operations_dotl = { .splice_read = v9fs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync_dotl, + .setlease = simple_nosetlease, }; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b01b1bbf2493..47bd77199e20 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -83,7 +83,7 @@ static int p9mode2perm(struct v9fs_session_info *v9ses, int res; int mode = stat->mode; - res = mode & S_IALLUGO; + res = mode & 0777; /* S_IRWXUGO */ if (v9fs_proto_dotu(v9ses)) { if ((mode & P9_DMSETUID) == P9_DMSETUID) res |= S_ISUID; @@ -178,6 +178,9 @@ int v9fs_uflags2omode(int uflags, int extended) break; } + if (uflags & O_TRUNC) + ret |= P9_OTRUNC; + if (extended) { if (uflags & O_EXCL) ret |= P9_OEXCL; @@ -1061,8 +1064,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct v9fs_session_info *v9ses = sb->s_fs_info; struct v9fs_inode *v9inode = V9FS_I(inode); - set_nlink(inode, 1); - inode_set_atime(inode, stat->atime, 0); inode_set_mtime(inode, stat->mtime, 0); inode_set_ctime(inode, stat->mtime, 0); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 4236058c7bbd..55e67e36ae68 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -244,6 +244,21 @@ done: return res; } +static int v9fs_drop_inode(struct inode *inode) +{ + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) + return generic_drop_inode(inode); + /* + * in non-cached mode, always drop the + * inode because we want the inode attributes + * to always match those on the server.
+ */ + return 1; +} + static int v9fs_write_inode(struct inode *inode, struct writeback_control *wbc) { @@ -268,6 +283,7 @@ static const struct super_operations v9fs_super_ops = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = simple_statfs, + .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, @@ -278,6 +294,7 @@ static const struct super_operations v9fs_super_ops_dotl = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = v9fs_statfs, + .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 3640f417cce1..5c180fdc3efb 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -281,7 +281,6 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; - struct bkey_s_c_xattr xattr; struct posix_acl *acl = NULL; struct bkey_s_c k; int ret; @@ -290,28 +289,27 @@ retry: ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, inode_inum(inode), &search, 0); - if (ret) { - if (!bch2_err_matches(ret, ENOENT)) - acl = ERR_PTR(ret); - goto out; - } + if (ret) + goto err; k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); - if (ret) { - acl = ERR_PTR(ret); - goto out; - } + if (ret) + goto err; - xattr = bkey_s_c_to_xattr(k); + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); + le16_to_cpu(xattr.v->x_val_len)); + ret = PTR_ERR_OR_ZERO(acl); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; - if (!IS_ERR(acl)) + if (ret) + acl = !bch2_err_matches(ret, ENOENT) ? 
ERR_PTR(ret) : NULL; + + if (!IS_ERR_OR_NULL(acl)) set_cached_acl(&inode->v, type, acl); -out: - if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart)) - goto retry; bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 114328acde72..fadb1078903d 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -49,13 +49,15 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, if (!bch2_dev_exists2(c, bp.k->p.inode)) return 0; + struct bch_dev *ca = bch_dev_bkey_exists(c, bp.k->p.inode); struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); int ret = 0; - bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)), + bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || + !bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)), c, err, - backpointer_pos_wrong, - "backpointer at wrong pos"); + backpointer_bucket_offset_wrong, + "backpointer bucket_offset wrong"); fsck_err: return ret; } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index da012ca7daee..85949b9fd880 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -53,14 +53,11 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, u64 bucket_offset) { struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); - struct bpos ret; - - ret = POS(bucket.inode, - (bucket_to_sector(ca, bucket.offset) << - MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); + struct bpos ret = POS(bucket.inode, + (bucket_to_sector(ca, bucket.offset) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); - return ret; } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a31a5f706929..91c3c1fef233 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -709,6 +709,8 @@ struct btree_trans_buf { x(stripe_delete) \ x(reflink) \ x(fallocate) \ + x(fsync) \ + x(dio_write) \ x(discard) \ x(discard_fast) \ x(invalidate) \ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 63102992d955..085987435a5e 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -578,7 +578,8 @@ struct bch_member { __le64 nbuckets; /* device size */ __le16 first_bucket; /* index of first bucket used */ __le16 bucket_size; /* sectors */ - __le32 pad; + __u8 btree_bitmap_shift; + __u8 pad[3]; __le64 last_mount; /* time_t */ __le64 flags; @@ -587,6 +588,7 @@ struct bch_member { __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; __le64 errors_reset_time; __le64 seq; + __le64 btree_allocated_bitmap; }; #define BCH_MEMBER_V1_BYTES 56 @@ -876,7 +878,8 @@ struct bch_sb_field_downgrade { x(rebalance_work, BCH_VERSION(1, 3)) \ x(member_seq, BCH_VERSION(1, 4)) \ x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ - x(btree_subvolume_children, BCH_VERSION(1, 6)) + x(btree_subvolume_children, BCH_VERSION(1, 6)) \ + x(mi_btree_bitmap, BCH_VERSION(1, 7)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1314,7 +1317,7 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(write_buffer_keys, 11) \ x(datetime, 12) -enum { +enum bch_jset_entry_type { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, BCH_JSET_ENTRY_TYPES() #undef x @@ -1360,7 +1363,7 @@ struct jset_entry_blacklist_v2 { x(inodes, 1) \ x(key_version, 2) -enum { +enum bch_fs_usage_type { #define x(f, nr) BCH_FS_USAGE_##f = nr, BCH_FS_USAGE_TYPES() #undef x 
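The new btree_bitmap_shift/btree_allocated_bitmap members added to struct bch_member above form a coarse 64-bit map of which regions of a device may contain btree nodes; bch2_dev_btree_bitmap_marked_sectors(), consulted by the btree node scan code further down, uses it so recovery can skip sectors that never held btree data. A minimal standalone sketch of the idea, assuming bit b covers sectors [b << shift, (b + 1) << shift) — the helper name and freestanding types here are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdint.h>

/*
 * Illustrative sketch, not the kernel implementation: a device-wide
 * 64-bit bitmap where bit b covers sectors [b << shift, (b + 1) << shift).
 * A range is "marked" only if every bit overlapping it is set.
 */
static bool btree_bitmap_covers(uint64_t bitmap, unsigned shift,
				uint64_t start, uint64_t sectors)
{
	uint64_t end = start + sectors;

	/* 64 bits can only describe sectors below 64 << shift */
	if (end > 64ULL << shift)
		return false;

	for (uint64_t bit = start >> shift; bit << shift < end; bit++)
		if (!(bitmap & (1ULL << bit)))
			return false;
	return true;
}

Marking a new btree node touches at most a few bits, which is why the hunks below (btree_gc.c and btree_update_interior.c) can afford to rewrite the superblock under sb_lock via bch2_dev_btree_bitmap_mark() whenever a node lands in an unmarked region.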
@@ -1535,6 +1538,20 @@ enum btree_id { BTREE_ID_NR }; +static inline bool btree_id_is_alloc(enum btree_id id) +{ + switch (id) { + case BTREE_ID_alloc: + case BTREE_ID_backpointers: + case BTREE_ID_need_discard: + case BTREE_ID_freespace: + case BTREE_ID_bucket_gens: + return true; + default: + return false; + } +} + #define BTREE_MAX_DEPTH 4U /* Btree nodes */ diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index cf23ff47bed8..3a45d128f608 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -314,6 +314,12 @@ static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, return bkey_packed(k) ? format->key_u64s : BKEY_U64s; } +static inline bool bkeyp_u64s_valid(const struct bkey_format *f, + const struct bkey_packed *k) +{ + return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s); +} + static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, const struct bkey_packed *k) { diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 5e52684764eb..db336a43fc08 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -171,11 +171,15 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (type >= BKEY_TYPE_NR) return 0; - bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && + bkey_fsck_err_on((type == BKEY_TYPE_btree || + (flags & BKEY_INVALID_COMMIT)) && !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", - bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]); + bch2_btree_node_type_str(type), + k.k->type < KEY_TYPE_MAX + ? bch2_bkey_types[k.k->type] + : "(unknown)"); if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { bkey_fsck_err_on(k.k->size == 0, c, err, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 84474324dba9..02c70e813fac 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -709,9 +709,31 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - u32 seq; - BUG_ON(level + 1 >= BTREE_MAX_DEPTH); + if (unlikely(level >= BTREE_MAX_DEPTH)) { + int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u", + level, BTREE_MAX_DEPTH); + return ERR_PTR(ret); + } + + if (unlikely(!bkey_is_btree_ptr(&k->k))) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf); + printbuf_exit(&buf); + return ERR_PTR(ret); + } + + if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf); + printbuf_exit(&buf); + return ERR_PTR(ret); + } + /* * Parent node must be locked, else we could read in a btree node that's * been freed: @@ -752,34 +774,26 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, } set_btree_node_read_in_flight(b); - six_unlock_write(&b->c.lock); - seq = six_lock_seq(&b->c.lock); - six_unlock_intent(&b->c.lock); - /* Unlock before doing IO: */ - if (path && sync) - bch2_trans_unlock_noassert(trans); - - bch2_btree_node_read(trans, b, sync); + if (path) { + u32 seq = six_lock_seq(&b->c.lock); - if (!sync) - return NULL; + /* Unlock before doing IO: */ + 
six_unlock_intent(&b->c.lock); + bch2_trans_unlock_noassert(trans); - if (path) { - int ret = bch2_trans_relock(trans) ?: - bch2_btree_path_relock_intent(trans, path); - if (ret) { - BUG_ON(!trans->restarted); - return ERR_PTR(ret); - } - } + bch2_btree_node_read(trans, b, sync); - if (!six_relock_type(&b->c.lock, lock_type, seq)) { - BUG_ON(!path); + if (!sync) + return NULL; - trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); + if (!six_relock_type(&b->c.lock, lock_type, seq)) + b = NULL; + } else { + bch2_btree_node_read(trans, b, sync); + if (lock_type == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); } return b; @@ -1112,18 +1126,19 @@ int bch2_btree_node_prefetch(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; - struct btree *b; BUG_ON(path && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); - b = btree_cache_find(bc, k); + struct btree *b = btree_cache_find(bc, k); if (b) return 0; b = bch2_btree_node_fill(trans, path, k, btree_id, level, SIX_LOCK_read, false); - return PTR_ERR_OR_ZERO(b); + if (!IS_ERR_OR_NULL(b)) + six_unlock_read(&b->c.lock); + return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b); } void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) @@ -1148,6 +1163,8 @@ wait_on_io: btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + if (unlikely(b->hash_val != btree_ptr_hash_val(k))) + goto out; if (btree_node_dirty(b)) { __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); @@ -1162,7 +1179,7 @@ wait_on_io: btree_node_data_free(c, b); bch2_btree_node_hash_remove(bc, b); mutex_unlock(&bc->lock); - +out: six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6280da1244b5..ecbd9598f69f 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -368,11 +368,16 @@ again: buf.buf)) { bch2_btree_node_evict(trans, cur_k.k); cur = NULL; - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: - bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); if (ret) break; + + if (!btree_id_is_alloc(b->c.btree_id)) { + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + break; + } continue; } @@ -544,12 +549,12 @@ reconstruct_root: bch2_btree_root_alloc_fake(c, i, 0); } else { bch2_btree_root_alloc_fake(c, i, 1); + bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); if (ret) break; } - bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); reconstructed_root = true; } @@ -823,6 +828,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, struct bch_fs *c = trans->c; struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + struct printbuf buf = PRINTBUF; int ret = 0; deleted.p = k->k->p; @@ -843,11 +849,23 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (ret) goto err; + if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, *k), + c, btree_bitmap_not_marked, + "btree ptr not marked in member info btree allocated bitmap\n %s", + 
(bch2_bkey_val_to_text(&buf, c, *k), + buf.buf))) { + mutex_lock(&c->sb_lock); + bch2_dev_btree_bitmap_mark(c, *k); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + ret = commit_do(trans, NULL, NULL, 0, bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); fsck_err: err: + printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d7de82ac3893..9678b2375bed 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -831,7 +831,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); } -static bool __bkey_valid(struct bch_fs *c, struct btree *b, +static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, struct bset *i, struct bkey_packed *k) { if (bkey_p_next(k) > vstruct_last(i)) @@ -840,7 +840,7 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b, if (k->format > KEY_FORMAT_CURRENT) return false; - if (k->u64s < bkeyp_key_u64s(&b->format, k)) + if (!bkeyp_u64s_valid(&b->format, k)) return false; struct printbuf buf = PRINTBUF; @@ -884,11 +884,13 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, "invalid bkey format %u", k->format)) goto drop_this_key; - if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k), + if (btree_err_on(!bkeyp_u64s_valid(&b->format, k), -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, btree_node_bkey_bad_u64s, - "k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k))) + "bad k->u64s %u (min %u max %lu)", k->u64s, + bkeyp_key_u64s(&b->format, k), + U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k))) goto drop_this_key; if (!write) @@ -947,13 +949,12 @@ drop_this_key: * do */ - if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { + if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { for (next_good_key = 1; next_good_key < (u64 *) vstruct_last(i) - (u64 *) k; next_good_key++) - if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) + if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) goto got_good_key; - } /* @@ -1339,7 +1340,9 @@ start: rb->start_time); bio_put(&rb->bio); - if (saw_error && !btree_node_read_error(b)) { + if (saw_error && + !btree_node_read_error(b) && + c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->key.k.p); bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 24772538e4cc..1c70836dd7cc 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -498,8 +498,13 @@ static inline void set_btree_iter_dontneed(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; - if (!trans->restarted) - btree_iter_path(trans, iter)->preserve = false; + if (!iter->path || trans->restarted) + return; + + struct btree_path *path = btree_iter_path(trans, iter); + path->preserve = false; + if (path->ref == 1) + path->should_be_locked = false; } void *__bch2_trans_kmalloc(struct btree_trans *, size_t); @@ -642,7 +647,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *); static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8) + if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8) return 
__bch2_btree_trans_too_many_iters(trans); return 0; diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 5cbcbfe85235..1e8cf49a6935 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -130,12 +130,30 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); } +static void journal_iter_verify(struct journal_iter *iter) +{ + struct journal_keys *keys = iter->keys; + size_t gap_size = keys->size - keys->nr; + + BUG_ON(iter->idx >= keys->gap && + iter->idx < keys->gap + gap_size); + + if (iter->idx < keys->size) { + struct journal_key *k = keys->data + iter->idx; + + int cmp = cmp_int(k->btree_id, iter->btree_id) ?: + cmp_int(k->level, iter->level); + BUG_ON(cmp < 0); + } +} + static void journal_iters_fix(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; /* The key we just inserted is immediately before the gap: */ size_t gap_end = keys->gap + (keys->size - keys->nr); - struct btree_and_journal_iter *iter; + struct journal_key *new_key = &keys->data[keys->gap - 1]; + struct journal_iter *iter; /* * If an iterator points one after the key we just inserted, decrement @@ -143,9 +161,14 @@ static void journal_iters_fix(struct bch_fs *c) * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will * handle that: */ - list_for_each_entry(iter, &c->journal_iters, journal.list) - if (iter->journal.idx == gap_end) - iter->journal.idx = keys->gap - 1; + list_for_each_entry(iter, &c->journal_iters, list) { + journal_iter_verify(iter); + if (iter->idx == gap_end && + new_key->btree_id == iter->btree_id && + new_key->level == iter->level) + iter->idx = keys->gap - 1; + journal_iter_verify(iter); + } } static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) @@ -192,7 +215,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, if (idx > keys->gap) idx -= keys->size - keys->nr; + size_t old_gap = keys->gap; + if (keys->nr == keys->size) { + journal_iters_move_gap(c, old_gap, keys->size); + old_gap = keys->size; + struct journal_keys new_keys = { .nr = keys->nr, .size = max_t(size_t, keys->size, 8) * 2, @@ -216,7 +244,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, keys->gap = keys->nr; } - journal_iters_move_gap(c, keys->gap, idx); + journal_iters_move_gap(c, old_gap, idx); move_gap(keys, idx); @@ -301,16 +329,21 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { - struct journal_key *k = iter->keys->data + iter->idx; + journal_iter_verify(iter); + + while (iter->idx < iter->keys->size) { + struct journal_key *k = iter->keys->data + iter->idx; + + int cmp = cmp_int(k->btree_id, iter->btree_id) ?: + cmp_int(k->level, iter->level); + if (cmp > 0) + break; + BUG_ON(cmp); - while (k < iter->keys->data + iter->keys->size && - k->btree_id == iter->btree_id && - k->level == iter->level) { if (!k->overwritten) return bkey_i_to_s_c(k->k); bch2_journal_iter_advance(iter); - k = iter->keys->data + iter->idx; } return bkey_s_c_null; @@ -330,6 +363,8 @@ static void bch2_journal_iter_init(struct bch_fs *c, iter->level = level; iter->keys = &c->journal_keys; iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); + + journal_iter_verify(iter); } static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) @@ -434,10 +469,15 @@ 
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, iter->trans = trans; iter->b = b; iter->node_iter = node_iter; - bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); - INIT_LIST_HEAD(&iter->journal.list); iter->pos = b->data->min_key; iter->at_end = false; + INIT_LIST_HEAD(&iter->journal.list); + + if (trans->journal_replay_not_finished) { + bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); + if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags)) + list_add(&iter->journal.list, &trans->c->journal_iters); + } } /* @@ -452,9 +492,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, bch2_btree_node_iter_init_from_start(&node_iter, b); __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); - if (trans->journal_replay_not_finished && - !test_bit(BCH_FS_may_go_rw, &trans->c->flags)) - list_add(&iter->journal.list, &trans->c->journal_iters); } /* sort and dedup all keys in the journal: */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 581edcb0911b..88a3582a3275 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -169,6 +169,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, } else { mutex_lock(&bc->lock); list_move_tail(&ck->list, &bc->freed_pcpu); + bc->nr_freed_pcpu++; mutex_unlock(&bc->lock); } } @@ -245,6 +246,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, if (!list_empty(&bc->freed_pcpu)) { ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_pcpu--; } mutex_unlock(&bc->lock); } @@ -659,7 +661,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, commit_flags |= BCH_WATERMARK_reclaim; if (ck->journal.seq != journal_last_seq(j) || - j->watermark == BCH_WATERMARK_stripe) + !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; ret = bch2_btree_iter_traverse(&b_iter) ?: diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index b9b151e693ed..f2caf491957e 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -440,33 +440,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, struct btree_path *path, struct btree_bkey_cached_common *b) { - struct btree_path *linked; - unsigned i, iter; - int ret; - - /* - * XXX BIG FAT NOTICE - * - * Drop all read locks before taking a write lock: - * - * This is a hack, because bch2_btree_node_lock_write_nofail() is a - * hack - but by dropping read locks first, this should never fail, and - * we only use this in code paths where whatever read locks we've - * already taken are no longer needed: - */ - - trans_for_each_path(trans, linked, iter) { - if (!linked->nodes_locked) - continue; - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - if (btree_node_read_locked(linked, i)) { - btree_node_unlock(trans, linked, i); - btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK); - } - } - - ret = __btree_node_lock_write(trans, path, b, true); + int ret = __btree_node_lock_write(trans, path, b, true); BUG_ON(ret); } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 3f33be7e5e5c..866bd278439f 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -133,6 +133,19 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, if (le64_to_cpu(bn->magic) != bset_magic(c)) return; + if 
(bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { + struct nonce nonce = btree_nonce(&bn->keys, 0); + unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; + + bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes); + } + + if (btree_id_is_alloc(BTREE_NODE_ID(bn))) + return; + + if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) + return; + rcu_read_lock(); struct found_btree_node n = { .btree_id = BTREE_NODE_ID(bn), @@ -192,8 +205,13 @@ static int read_btree_nodes_worker(void *p) last_print = jiffies; } - try_read_btree_node(w->f, ca, bio, buf, - bucket * ca->mi.bucket_size + bucket_offset); + u64 sector = bucket * ca->mi.bucket_size + bucket_offset; + + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap && + !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) + continue; + + try_read_btree_node(w->f, ca, bio, buf, sector); } err: bio_put(bio); @@ -213,6 +231,9 @@ static int read_btree_nodes(struct find_btree_nodes *f) closure_init_stack(&cl); for_each_online_member(c, ca) { + if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) + continue; + struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); struct task_struct *t; @@ -290,7 +311,7 @@ again: found_btree_node_to_text(&buf, c, n); bch_err(c, "%s", buf.buf); printbuf_exit(&buf); - return -1; + return -BCH_ERR_fsck_repair_unimplemented; } } @@ -436,6 +457,9 @@ bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos node_min, struct bpos node_max) { + if (btree_id_is_alloc(btree)) + return 0; + struct find_btree_nodes *f = &c->found_btree_nodes; int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index aa9da4970740..bbec91e8e650 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -397,12 +397,13 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags struct bkey_cached *ck = (void *) path->l[0].b; unsigned new_u64s; struct bkey_i *new_k; + unsigned watermark = flags & BCH_WATERMARK_MASK; EBUG_ON(path->level); - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bch2_btree_key_cache_must_wait(c) && - !(flags & BCH_TRANS_COMMIT_journal_reclaim)) + if (watermark < BCH_WATERMARK_reclaim && + !test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bch2_btree_key_cache_must_wait(c)) return -BCH_ERR_btree_insert_need_journal_reclaim; /* @@ -499,9 +500,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ } static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - struct btree_insert_entry *btree_id_start) + unsigned btree_id_start) { - struct btree_insert_entry *i; bool trans_trigger_run; int ret, overwrite; @@ -514,13 +514,13 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, do { trans_trigger_run = false; - for (i = btree_id_start; - i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + for (unsigned i = btree_id_start; + i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; i++) { - if (i->btree_id != btree_id) + if (trans->updates[i].btree_id != btree_id) continue; - ret = run_one_trans_trigger(trans, i, overwrite); + ret = run_one_trans_trigger(trans, trans->updates + i, overwrite); if (ret < 0) return ret; if (ret) @@ -534,8 +534,7 @@ static int run_btree_triggers(struct 
btree_trans *trans, enum btree_id btree_id, static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - struct btree_insert_entry *btree_id_start = trans->updates; - unsigned btree_id = 0; + unsigned btree_id = 0, btree_id_start = 0; int ret = 0; /* @@ -549,8 +548,8 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) if (btree_id == BTREE_ID_alloc) continue; - while (btree_id_start < trans->updates + trans->nr_updates && - btree_id_start->btree_id < btree_id) + while (btree_id_start < trans->nr_updates && + trans->updates[btree_id_start].btree_id < btree_id) btree_id_start++; ret = run_btree_triggers(trans, btree_id, btree_id_start); @@ -558,11 +557,13 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) return ret; } - trans_for_each_update(trans, i) { + for (unsigned idx = 0; idx < trans->nr_updates; idx++) { + struct btree_insert_entry *i = trans->updates + idx; + if (i->btree_id > BTREE_ID_alloc) break; if (i->btree_id == BTREE_ID_alloc) { - ret = run_btree_triggers(trans, BTREE_ID_alloc, i); + ret = run_btree_triggers(trans, BTREE_ID_alloc, idx); if (ret) return ret; break; } @@ -826,7 +827,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags struct bch_fs *c = trans->c; int ret = 0, u64s_delta = 0; - trans_for_each_update(trans, i) { + for (unsigned idx = 0; idx < trans->nr_updates; idx++) { + struct btree_insert_entry *i = trans->updates + idx; if (i->cached) continue; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 9404d96c38f3..e0c982a4195c 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -364,7 +364,21 @@ struct btree_insert_entry { unsigned long ip_allocated; }; +/* Number of btree paths we preallocate, usually enough */ #define BTREE_ITER_INITIAL 64 +/* + * Limit for btree_trans_too_many_iters(); this is enough that almost all code + * paths should run inside this limit, and if they don't it usually indicates a + * bug (leaking/duplicated btree paths). + * + * exception: some fsck paths + * + * bugs with excessive path usage seem to have possibly been eliminated now, so + * we might consider eliminating this (and btree_trans_too_many_iters()) at some + * point.
+ */ +#define BTREE_ITER_NORMAL_LIMIT 256 +/* never exceed limit */ #define BTREE_ITER_MAX (1U << 10) struct btree_trans_commit_hook; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 32397b99752f..6030c396754f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -21,14 +21,15 @@ #include "keylist.h" #include "recovery_passes.h" #include "replicas.h" +#include "sb-members.h" #include "super-io.h" #include "trace.h" #include <linux/random.h> -const char * const bch2_btree_update_modes[] = { +static const char * const bch2_btree_update_modes[] = { #define x(t) #t, - BCH_WATERMARKS() + BTREE_UPDATE_MODES() #undef x NULL }; @@ -605,6 +606,26 @@ static void btree_update_add_key(struct btree_update *as, bch2_keylist_push(keys); } +static bool btree_update_new_nodes_marked_sb(struct btree_update *as) +{ + for_each_keylist_key(&as->new_keys, k) + if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k))) + return false; + return true; +} + +static void btree_update_new_nodes_mark_sb(struct btree_update *as) +{ + struct bch_fs *c = as->c; + + mutex_lock(&c->sb_lock); + for_each_keylist_key(&as->new_keys, k) + bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k)); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + /* * The transactional part of an interior btree node update, where we journal the * update we did to the interior node and update alloc info: @@ -662,6 +683,9 @@ static void btree_update_nodes_written(struct btree_update *as) if (ret) goto err; + if (!btree_update_new_nodes_marked_sb(as)) + btree_update_new_nodes_mark_sb(as); + /* * Wait for any in flight writes to finish before we free the old nodes * on disk: @@ -704,9 +728,13 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, "%s", bch2_err_str(ret)); err: - if (as->b) { - - b = as->b; + /* + * We have to be careful because another thread might be getting ready + * to free as->b and calling btree_update_reparent() on us - we'll + * recheck under btree_update_lock below: + */ + b = READ_ONCE(as->b); + if (b) { btree_path_idx_t path_idx = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p); struct btree_path *path = trans->paths + path_idx; @@ -850,15 +878,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - BUG_ON(as->mode != BTREE_UPDATE_none); + BUG_ON(as->update_level_end < b->c.level); BUG_ON(!btree_node_dirty(b)); BUG_ON(!b->c.level); + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + as->mode = BTREE_UPDATE_node; as->b = b; + as->update_level_end = b->c.level; set_btree_node_write_blocked(b); list_add(&as->write_blocked_list, &b->write_blocked); @@ -1100,7 +1130,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level, bool split, unsigned flags) + unsigned level_start, bool split, unsigned flags) { struct bch_fs *c = trans->c; struct btree_update *as; @@ -1108,7 +1138,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc) ? 
BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2] = { 0, 0 }; - unsigned update_level = level; + unsigned level_end = level_start; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; int ret = 0; u32 restart_count = trans->restart_count; @@ -1123,34 +1153,30 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (watermark < c->journal.watermark) { - struct journal_res res = { 0 }; - unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK; + if (watermark < BCH_WATERMARK_reclaim && + test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) { + if (flags & BCH_TRANS_COMMIT_journal_reclaim) + return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock); - if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < BCH_WATERMARK_reclaim) - journal_flags |= JOURNAL_RES_GET_NONBLOCK; - - ret = drop_locks_do(trans, - bch2_journal_res_get(&c->journal, &res, 1, journal_flags)); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - ret = -BCH_ERR_journal_reclaim_would_deadlock; + bch2_trans_unlock(trans); + wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)); + ret = bch2_trans_relock(trans); if (ret) return ERR_PTR(ret); } while (1) { - nr_nodes[!!update_level] += 1 + split; - update_level++; + nr_nodes[!!level_end] += 1 + split; + level_end++; - ret = bch2_btree_path_upgrade(trans, path, update_level + 1); + ret = bch2_btree_path_upgrade(trans, path, level_end + 1); if (ret) return ERR_PTR(ret); - if (!btree_path_node(path, update_level)) { + if (!btree_path_node(path, level_end)) { /* Allocating new root? */ nr_nodes[1] += split; - update_level = BTREE_MAX_DEPTH; + level_end = BTREE_MAX_DEPTH; break; } @@ -1158,11 +1184,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * Always check for space for two keys, even if we won't have to * split at prior level - it might have been a merge instead: */ - if (bch2_btree_node_insert_fits(path->l[update_level].b, + if (bch2_btree_node_insert_fits(path->l[level_end].b, BKEY_BTREE_PTR_U64s_MAX * 2)) break; - split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); + split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); } if (!down_read_trylock(&c->gc_lock)) { @@ -1176,14 +1202,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); memset(as, 0, sizeof(*as)); closure_init(&as->cl, NULL); - as->c = c; - as->start_time = start_time; - as->ip_started = _RET_IP_; - as->mode = BTREE_UPDATE_none; - as->watermark = watermark; - as->took_gc_lock = true; - as->btree_id = path->btree_id; - as->update_level = update_level; + as->c = c; + as->start_time = start_time; + as->ip_started = _RET_IP_; + as->mode = BTREE_UPDATE_none; + as->watermark = watermark; + as->took_gc_lock = true; + as->btree_id = path->btree_id; + as->update_level_start = level_start; + as->update_level_end = level_end; INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->unwritten_list); INIT_LIST_HEAD(&as->write_blocked_list); @@ -1277,23 +1304,29 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) bch2_recalc_btree_reserve(c); } -static void bch2_btree_set_root(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +static int bch2_btree_set_root(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + bool nofail) { struct bch_fs *c 
= as->c; - struct btree *old; trace_and_count(c, btree_node_set_root, trans, b); - old = btree_node_root(c, b); + struct btree *old = btree_node_root(c, b); /* * Ensure no one is using the old root while we switch to the * new root: */ - bch2_btree_node_lock_write_nofail(trans, path, &old->c); + if (nofail) { + bch2_btree_node_lock_write_nofail(trans, path, &old->c); + } else { + int ret = bch2_btree_node_lock_write(trans, path, &old->c); + if (ret) + return ret; + } bch2_btree_set_root_inmem(c, b); @@ -1307,6 +1340,7 @@ static void bch2_btree_set_root(struct btree_update *as, * depend on the new root would have to update the new root. */ bch2_btree_node_unlock_write(trans, path, old); + return 0; } /* Interior node updates: */ @@ -1373,12 +1407,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, } static void -__bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter node_iter, - struct keylist *keys) +bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter node_iter, + struct keylist *keys) { struct bkey_i *insert = bch2_keylist_front(keys); struct bkey_packed *k; @@ -1534,7 +1568,7 @@ static void btree_split_insert_keys(struct btree_update *as, bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); - __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); + bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); BUG_ON(bch2_btree_node_check_topology(trans, b)); } @@ -1649,15 +1683,16 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (parent) { /* Split a non root node */ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); - if (ret) - goto err; } else if (n3) { - bch2_btree_set_root(as, trans, trans->paths + path, n3); + ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false); } else { /* Root filled up but didn't need to be split */ - bch2_btree_set_root(as, trans, trans->paths + path, n1); + ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false); } + if (ret) + goto err; + if (n3) { bch2_btree_update_get_open_buckets(as, n3); bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); @@ -1714,27 +1749,6 @@ err: goto out; } -static void -bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct keylist *keys) -{ - struct btree_path *linked; - unsigned i; - - __bch2_btree_insert_keys_interior(as, trans, path, b, - path->l[b->c.level].iter, keys); - - btree_update_updated_node(as, b); - - trans_for_each_path_with_node(trans, b, linked, i) - bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - - bch2_trans_verify_paths(trans); -} - /** * bch2_btree_insert_node - insert bkeys into a given btree node * @@ -1755,7 +1769,8 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t struct keylist *keys) { struct bch_fs *c = as->c; - struct btree_path *path = trans->paths + path_idx; + struct btree_path *path = trans->paths + path_idx, *linked; + unsigned i; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; @@ -1784,7 +1799,13 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t return ret; } - bch2_btree_insert_keys_interior(as, trans, path, 
b, keys); + bch2_btree_insert_keys_interior(as, trans, path, b, + path->l[b->c.level].iter, keys); + + trans_for_each_path_with_node(trans, b, linked, i) + bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); + + bch2_trans_verify_paths(trans); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; @@ -1798,6 +1819,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_maybe_compact_whiteouts(c, b)) bch2_trans_node_reinit_iter(trans, b); + btree_update_updated_node(as, b); bch2_btree_node_unlock_write(trans, path, b); BUG_ON(bch2_btree_node_check_topology(trans, b)); @@ -1807,7 +1829,7 @@ split: * We could attempt to avoid the transaction restart, by calling * bch2_btree_path_upgrade() and allocating more nodes: */ - if (b->c.level >= as->update_level) { + if (b->c.level >= as->update_level_end) { trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } @@ -1873,7 +1895,9 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans * bch2_keylist_add(&as->parent_keys, &b->key); btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys); - bch2_btree_set_root(as, trans, path, n); + int ret = bch2_btree_set_root(as, trans, path, n, true); + BUG_ON(ret); + bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); bch2_trans_node_add(trans, path, n); @@ -1926,6 +1950,18 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); + /* + * Work around a deadlock caused by the btree write buffer not doing + * merges and leaving tons of merges for us to do - we really don't need + * to be doing merges at all from the interior update path, and if the + * interior update path is generating too many new interior updates we + * deadlock: + */ + if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates) + return 0; + + flags &= ~BCH_WATERMARK_MASK; + b = trans->paths[path].l[level].b; if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || @@ -2071,6 +2107,10 @@ err: bch2_path_put(trans, new_path, true); bch2_path_put(trans, sib_path, true); bch2_trans_verify_locks(trans); + if (ret == -BCH_ERR_journal_reclaim_would_deadlock) + ret = 0; + if (!ret) + ret = bch2_trans_relock(trans); return ret; err_free_update: bch2_btree_node_free_never_used(as, trans, n); @@ -2116,12 +2156,13 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); - if (ret) - goto err; } else { - bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n); + ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false); } + if (ret) + goto err; + bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); @@ -2519,9 +2560,11 @@ void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned lev static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) { - prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", (void *) as->ip_started, 
bch2_btree_id_str(as->btree_id), + as->update_level_start, + as->update_level_end, bch2_watermarks[as->watermark], bch2_btree_update_modes[as->mode], as->nodes_written, diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 88dcf5a22a3b..c1a479ebaad1 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -57,7 +57,8 @@ struct btree_update { unsigned took_gc_lock:1; enum btree_id btree_id; - unsigned update_level; + unsigned update_level_start; + unsigned update_level_end; struct disk_reservation disk_res; diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index baf63e2fddb6..36a6f42aba5e 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -316,6 +316,16 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { bch2_btree_node_unlock_write(trans, path, path->l[0].b); write_locked = false; + + ret = lockrestart_do(trans, + bch2_btree_iter_traverse(&iter) ?: + bch2_foreground_maybe_merge(trans, iter.path, 0, + BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc)); + if (ret) + goto err; } } @@ -382,10 +392,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) ret = commit_do(trans, NULL, NULL, BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_journal_res| - BCH_TRANS_COMMIT_journal_reclaim, + BCH_TRANS_COMMIT_no_journal_res , btree_write_buffered_insert(trans, i)); if (ret) goto err; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 00aaf4bb5139..f9af5adabe83 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -395,14 +395,6 @@ static inline const char *bch2_data_type_str(enum bch_data_type type) : "(invalid data type)"; } -static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type) -{ - if (type < BCH_DATA_NR) - prt_str(out, __bch2_data_types[type]); - else - prt_printf(out, "(invalid data type %u)", type); -} - /* disk reservations: */ static inline void bch2_disk_reservation_put(struct bch_fs *c, diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index cbfa6459bdbc..72781aad6ba7 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -134,42 +134,38 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg struct fsck_thread { struct thread_with_stdio thr; struct bch_fs *c; - char **devs; - size_t nr_devs; struct bch_opts opts; }; static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) { struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); - if (thr->devs) - for (size_t i = 0; i < thr->nr_devs; i++) - kfree(thr->devs[i]); - kfree(thr->devs); kfree(thr); } static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) { struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); - - if (IS_ERR(c)) - return PTR_ERR(c); + struct bch_fs *c = thr->c; - int ret = 0; - if (test_bit(BCH_FS_errors_fixed, &c->flags)) - ret |= 1; - if (test_bit(BCH_FS_error, &c->flags)) - ret |= 4; + int ret = PTR_ERR_OR_ZERO(c); + if (ret) + return ret; - bch2_fs_stop(c); + ret = bch2_fs_start(thr->c); + if (ret) + goto err; - if (ret & 1) + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { 
bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); - if (ret & 4) + ret |= 1; + } + if (test_bit(BCH_FS_error, &c->flags)) { bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); - + ret |= 4; + } +err: + bch2_fs_stop(c); return ret; } @@ -182,7 +178,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a { struct bch_ioctl_fsck_offline arg; struct fsck_thread *thr = NULL; - u64 *devs = NULL; + darray_str(devs) = {}; long ret = 0; if (copy_from_user(&arg, user_arg, sizeof(arg))) @@ -194,29 +190,32 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) || - !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) || - !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) { - ret = -ENOMEM; - goto err; - } + for (size_t i = 0; i < arg.nr_devs; i++) { + u64 dev_u64; + ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); + if (ret) + goto err; - thr->opts = bch2_opts_empty(); - thr->nr_devs = arg.nr_devs; + char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); + ret = PTR_ERR_OR_ZERO(dev_str); + if (ret) + goto err; - if (copy_from_user(devs, &user_arg->devs[0], - array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) { - ret = -EINVAL; - goto err; + ret = darray_push(&devs, dev_str); + if (ret) { + kfree(dev_str); + goto err; + } } - for (size_t i = 0; i < arg.nr_devs; i++) { - thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX); - ret = PTR_ERR_OR_ZERO(thr->devs[i]); - if (ret) - goto err; + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; } + thr->opts = bch2_opts_empty(); + if (arg.opts) { char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); @@ -230,15 +229,26 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); + /* We need request_key() to be called before we punt to kthread: */ + opt_set(thr->opts, nostart, true); + + thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); + + if (!IS_ERR(thr->c) && + thr->c->opts.errors == BCH_ON_ERROR_panic) + thr->c->opts.errors = BCH_ON_ERROR_ro; + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops); -err: - if (ret < 0) { - if (thr) - bch2_fsck_thread_exit(&thr->thr); - pr_err("ret %s", bch2_err_str(ret)); - } - kfree(devs); +out: + darray_for_each(devs, i) + kfree(*i); + darray_exit(&devs); return ret; +err: + if (thr) + bch2_fsck_thread_exit(&thr->thr); + pr_err("ret %s", bch2_err_str(ret)); + goto out; } static long bch2_global_ioctl(unsigned cmd, void __user *arg) diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 4701457f6381..7ed779b411f6 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -429,15 +429,20 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, extent_nonce(version, crc_old), bio); if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { - bch_err(c, "checksum error in %s() (memory corruption or bug?)\n" - "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", - __func__, - crc_old.csum.hi, - crc_old.csum.lo, - merged.hi, - merged.lo, - bch2_csum_types[crc_old.csum_type], - bch2_csum_types[new_csum_type]); + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "checksum error in %s() (memory corruption or 
bug?)\n" + "expected %0llx:%0llx got %0llx:%0llx (old type ", + __func__, + crc_old.csum.hi, + crc_old.csum.lo, + merged.hi, + merged.lo); + bch2_prt_csum_type(&buf, crc_old.csum_type); + prt_str(&buf, " new type "); + bch2_prt_csum_type(&buf, new_csum_type); + prt_str(&buf, ")"); + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); return -EIO; } diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 1b8c2c1016dc..e40499fde9a4 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -61,11 +61,12 @@ static inline void bch2_csum_err_msg(struct printbuf *out, struct bch_csum expected, struct bch_csum got) { - prt_printf(out, "checksum error: got "); + prt_str(out, "checksum error, type "); + bch2_prt_csum_type(out, type); + prt_str(out, ": got "); bch2_csum_to_text(out, type, got); prt_str(out, " should be "); bch2_csum_to_text(out, type, expected); - prt_printf(out, " type %s", bch2_csum_types[type]); } int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 58c2eb45570f..607fd5e232c9 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -47,14 +47,6 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; } -static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type) -{ - if (type < BCH_COMPRESSION_TYPE_NR) - prt_str(out, __bch2_compression_types[type]); - else - prt_printf(out, "(invalid compression type %u)", type); -} - int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, struct bch_extent_crc_unpacked *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 34731ee0217f..0022b51ce3c0 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -598,6 +598,8 @@ int bch2_data_update_init(struct btree_trans *trans, i++; } + unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); + /* * If current extent durability is less than io_opts.data_replicas, * we're not trying to rereplicate the extent up to data_replicas here - @@ -607,7 +609,7 @@ int bch2_data_update_init(struct btree_trans *trans, * rereplicate, currently, so that users don't get an unexpected -ENOSPC */ if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) && - durability_have >= io_opts.data_replicas) { + !durability_required) { m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; m->data_opts.rewrite_ptrs = 0; /* if iter == NULL, it's just a promote */ @@ -616,11 +618,18 @@ int bch2_data_update_init(struct btree_trans *trans, goto done; } - m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) + + m->op.nr_replicas = min(durability_removing, durability_required) + m->data_opts.extra_replicas; - m->op.nr_replicas_required = m->op.nr_replicas; - BUG_ON(!m->op.nr_replicas); + /* + * If device(s) were set to durability=0 after data was written to them, + * we can end up with a durability=0 extent, and the normal algorithm + * that tries not to increase durability doesn't work: + */ + if (!(durability_have + durability_removing)) + m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); + + m->op.nr_replicas_required = m->op.nr_replicas; if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 
208ce6f0fc43..cd99b7399414 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -13,6 +13,7 @@ #include "btree_iter.h" #include "btree_locking.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "buckets.h" #include "debug.h" #include "error.h" @@ -668,7 +669,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, i->size = size; i->ret = 0; - do { + while (1) { err = flush_buf(i); if (err) return err; @@ -676,9 +677,12 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, if (!i->size) break; + if (done) + break; + done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); i->iter++; - } while (!done); + } if (i->buf.allocation_failure) return -ENOMEM; @@ -693,13 +697,45 @@ static const struct file_operations journal_pins_ops = { .read = bch2_journal_pins_read, }; +static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (!i->iter) { + bch2_btree_updates_to_text(&i->buf, c); + i->iter++; + } + + err = flush_buf(i); + if (err) + return err; + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations btree_updates_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_btree_updates_read, +}; + static int btree_transaction_stats_open(struct inode *inode, struct file *file) { struct bch_fs *c = inode->i_private; struct dump_iter *i; i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) return -ENOMEM; @@ -866,6 +902,20 @@ void bch2_fs_debug_exit(struct bch_fs *c) debugfs_remove_recursive(c->fs_debug_dir); } +static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd) +{ + struct dentry *d; + + d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir); + + debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops); + + debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops); + + debugfs_create_file("bfloat-failed", 0400, d, bd, + &bfloat_failed_debug_ops); +} + void bch2_fs_debug_init(struct bch_fs *c) { struct btree_debug *bd; @@ -888,6 +938,9 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); + debugfs_create_file("btree_updates", 0400, c->fs_debug_dir, + c->btree_debug, &btree_updates_ops); + debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, c, &btree_transaction_stats_op); @@ -902,21 +955,7 @@ void bch2_fs_debug_init(struct bch_fs *c) bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; - debugfs_create_file(bch2_btree_id_str(bd->id), - 0400, c->btree_debug_dir, bd, - &btree_debug_ops); - - snprintf(name, sizeof(name), "%s-formats", - bch2_btree_id_str(bd->id)); - - debugfs_create_file(name, 0400, c->btree_debug_dir, bd, - &btree_format_debug_ops); - - snprintf(name, sizeof(name), "%s-bfloat-failed", - bch2_btree_id_str(bd->id)); - - debugfs_create_file(name, 0400, c->btree_debug_dir, bd, - &bfloat_failed_debug_ops); + bch2_fs_debug_btree_init(c, bd); } } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 082075244e16..556a217108d3 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -131,29 +131,33 @@ fsck_err: void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - const struct 
bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned i, nr_data = s->nr_blocks - s->nr_redundant; + const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v; + struct bch_stripe s = {}; + + memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k))); + + unsigned nr_data = s.nr_blocks - s.nr_redundant; + + prt_printf(out, "algo %u sectors %u blocks %u:%u csum ", + s.algorithm, + le16_to_cpu(s.sectors), + nr_data, + s.nr_redundant); + bch2_prt_csum_type(out, s.csum_type); + prt_printf(out, " gran %u", 1U << s.csum_granularity_bits); + + for (unsigned i = 0; i < s.nr_blocks; i++) { + const struct bch_extent_ptr *ptr = sp->ptrs + i; + + if ((void *) ptr >= bkey_val_end(k)) + break; + + bch2_extent_ptr_to_text(out, c, ptr); - prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", - s->algorithm, - le16_to_cpu(s->sectors), - nr_data, - s->nr_redundant, - s->csum_type, - 1U << s->csum_granularity_bits); - - for (i = 0; i < s->nr_blocks; i++) { - const struct bch_extent_ptr *ptr = s->ptrs + i; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - u32 offset; - u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - - prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); - if (i < nr_data) - prt_printf(out, "#%u", stripe_blockcount_get(s, i)); - prt_printf(out, " gen %u", ptr->gen); - if (ptr_stale(ca, ptr)) - prt_printf(out, " stale"); + if (s.csum_type < BCH_CSUM_NR && + i < nr_data && + stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k)) + prt_printf(out, "#%u", stripe_blockcount_get(sp, i)); } } @@ -607,10 +611,8 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct printbuf err = PRINTBUF; struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev); - prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n", - want.hi, want.lo, - got.hi, got.lo, - bch2_csum_types[v->csum_type]); + prt_str(&err, "stripe "); + bch2_csum_err_msg(&err, v->csum_type, want, got); prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); bch_err_ratelimited(ca, "%s", err.buf); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index f4369b02e805..f042616888b0 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -32,6 +32,8 @@ static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) static inline unsigned stripe_csum_offset(const struct bch_stripe *s, unsigned dev, unsigned csum_idx) { + EBUG_ON(s->csum_type >= BCH_CSUM_NR); + unsigned csum_bytes = bch_crc_bytes[s->csum_type]; return sizeof(struct bch_stripe) + diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 0e3ca99fbd2d..1a331e539204 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -998,7 +998,9 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc prt_str(out, " cached"); if (ptr->unwritten) prt_str(out, " unwritten"); - if (ca && ptr_stale(ca, ptr)) + if (b >= ca->mi.first_bucket && + b < ca->mi.nbuckets && + ptr_stale(ca, ptr)) prt_printf(out, " stale"); } } @@ -1028,11 +1030,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ", + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", crc.compressed_size, crc.uncompressed_size, - crc.offset, crc.nonce, - bch2_csum_types[crc.csum_type]); + crc.offset, crc.nonce); + bch2_prt_csum_type(out, 
crc.csum_type); + prt_str(out, " compress "); bch2_prt_compression_type(out, crc.compression_type); break; } diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c index 4ce5e957a6e9..0f955c3c76a7 100644 --- a/fs/bcachefs/eytzinger.c +++ b/fs/bcachefs/eytzinger.c @@ -115,7 +115,7 @@ static void swap_bytes(void *a, void *b, size_t n) struct wrapper { cmp_func_t cmp; - swap_func_t swap; + swap_func_t swap_func; }; /* @@ -125,7 +125,7 @@ struct wrapper { static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) { if (swap_func == SWAP_WRAPPER) { - ((const struct wrapper *)priv)->swap(a, b, (int)size); + ((const struct wrapper *)priv)->swap_func(a, b, (int)size); return; } @@ -174,7 +174,7 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, int i, c, r; /* called from 'sort' without swap function, let's pick the default */ - if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) swap_func = NULL; if (!swap_func) { @@ -227,7 +227,7 @@ void eytzinger0_sort(void *base, size_t n, size_t size, { struct wrapper w = { .cmp = cmp_func, - .swap = swap_func, + .swap_func = swap_func, }; return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index ee0e2df33322..24840aee335c 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -242,8 +242,8 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) = eytzinger0_next((_i), (_size))) /* return greatest node <= @search, or -1 if not found */ -static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) +static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) { unsigned i, n = 0; @@ -256,18 +256,32 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, } while (n < nr); if (n & 1) { - /* @i was greater than @search, return previous node: */ + /* + * @i was greater than @search, return previous node: + * + * if @i was leftmost/smallest element, + * eytzinger0_prev(eytzinger0_first()) returns -1, as expected + */ return eytzinger0_prev(i, nr); } else { return i; } } -static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) +static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) { ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + + /* + * if eytzinger0_find_le() returned -1 - no element was <= search - we + * want to return the first element; next/prev identities mean this works + * as expected + * + * similarly if find_le() returns last element, we should return -1; + * identities mean this all works out: + */ + return eytzinger0_next(idx, nr); } #define eytzinger0_find(base, nr, size, _cmp, search) \
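The find_le/find_gt pair works because an eytzinger0 array is an implicit binary search tree stored breadth-first: the children of node i live at 2i+1 and 2i+2, and an in-order walk visits the elements in sorted order. A self-contained sketch of the descent (illustrative helper, not the kernel implementation, which additionally leans on the next/prev identities noted above):

/* Sketch only: greatest element <= key in an eytzinger0 (BFS-order) array. */
#include <stddef.h>
#include <sys/types.h>

static ssize_t eyt0_find_le_sketch(const int *a, size_t nr, int key)
{
	size_t i = 0;
	ssize_t best = -1;

	while (i < nr) {
		if (a[i] <= key) {
			best = (ssize_t) i;	/* candidate; a larger one may sit to the right */
			i = 2 * i + 2;		/* right child */
		} else {
			i = 2 * i + 1;		/* left child */
		}
	}
	return best;	/* eytzinger index of the answer, or -1 if none is <= key */
}

diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index f49e6c0f0f68..b889370a5088 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -387,6 +387,8 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); + bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write); + /* inode->i_dio_count is our ref on inode and thus bch_fs */ inode_dio_end(&inode->v); @@ -590,22 +592,25 @@ ssize_t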
bch2_direct_write(struct kiocb *req, struct iov_iter *iter) prefetch(&inode->ei_inode); prefetch((void *) &inode->ei_inode + 64); + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write)) + return -EROFS; + inode_lock(&inode->v); ret = generic_write_checks(req, iter); if (unlikely(ret <= 0)) - goto err; + goto err_put_write_ref; ret = file_remove_privs(file); if (unlikely(ret)) - goto err; + goto err_put_write_ref; ret = file_update_time(file); if (unlikely(ret)) - goto err; + goto err_put_write_ref; if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) - goto err; + goto err_put_write_ref; inode_dio_begin(&inode->v); bch2_pagecache_block_get(inode); @@ -645,7 +650,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) } ret = bch2_dio_write_loop(dio); -err: +out: if (locked) inode_unlock(&inode->v); return ret; @@ -653,7 +658,9 @@ err_put_bio: bch2_pagecache_block_put(inode); bio_put(bio); inode_dio_end(&inode->v); - goto err; +err_put_write_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); + goto out; } void bch2_fs_fs_io_direct_exit(struct bch_fs *c) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 8c70123b6a0c..20b40477425f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -174,18 +174,18 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, static int bch2_flush_inode(struct bch_fs *c, struct bch_inode_info *inode) { - struct bch_inode_unpacked u; - int ret; - if (c->opts.journal_flush_disabled) return 0; - ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u); - if (ret) - return ret; + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) + return -EROFS; - return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: - bch2_inode_flush_nocow_writes(c, inode); + struct bch_inode_unpacked u; + int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?: + bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: + bch2_inode_flush_nocow_writes(c, inode); + bch2_write_ref_put(c, BCH_WRITE_REF_fsync); + return ret; } int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 725fcf46f631..9aa28b52ab92 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -247,7 +247,7 @@ static void journal_entry_err_msg(struct printbuf *out, if (entry) { prt_str(out, " type="); - prt_str(out, bch2_jset_entry_types[entry->type]); + bch2_prt_jset_entry_type(out, entry->type); } if (!jset) { @@ -403,7 +403,8 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs jset_entry_for_each_key(entry, k) { if (!first) { prt_newline(out); - prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + bch2_prt_jset_entry_type(out, entry->type); + prt_str(out, ": "); } prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); @@ -563,9 +564,9 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - prt_printf(out, "type=%s v=%llu", - bch2_fs_usage_types[u->entry.btree_id], - le64_to_cpu(u->v)); + prt_str(out, "type="); + bch2_prt_fs_usage_type(out, u->entry.btree_id); + prt_printf(out, " v=%llu", le64_to_cpu(u->v)); } static int journal_entry_data_usage_validate(struct bch_fs *c, @@ -827,11 +828,11 @@ int bch2_journal_entry_validate(struct bch_fs *c, void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs 
*c, struct jset_entry *entry) { + bch2_prt_jset_entry_type(out, entry->type); + if (entry->type < BCH_JSET_ENTRY_NR) { - prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + prt_str(out, ": "); bch2_jset_entry_ops[entry->type].to_text(out, c, entry); - } else { - prt_printf(out, "(unknown type %u)", entry->type); } } diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ab811c0dad26..04a577848b01 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -67,6 +67,8 @@ void bch2_journal_set_watermark(struct journal *j) track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) trace_and_count(c, journal_full, c); + mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin); + swap(watermark, j->watermark); if (watermark > j->watermark) journal_wake(j); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 8c053cb64ca5..b5161b5d76a0 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -134,6 +134,7 @@ enum journal_flags { JOURNAL_STARTED, JOURNAL_MAY_SKIP_FLUSH, JOURNAL_NEED_FLUSH_WRITE, + JOURNAL_SPACE_LOW, }; /* Reasons we may fail to get a journal reservation: */ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index e1800c4119b5..bb068fd72465 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -43,7 +43,7 @@ const char * const __bch2_btree_ids[] = { NULL }; -const char * const bch2_csum_types[] = { +static const char * const __bch2_csum_types[] = { BCH_CSUM_TYPES() NULL }; @@ -53,7 +53,7 @@ const char * const bch2_csum_opts[] = { NULL }; -const char * const __bch2_compression_types[] = { +static const char * const __bch2_compression_types[] = { BCH_COMPRESSION_TYPES() NULL }; @@ -83,18 +83,39 @@ const char * const bch2_member_states[] = { NULL }; -const char * const bch2_jset_entry_types[] = { +static const char * const __bch2_jset_entry_types[] = { BCH_JSET_ENTRY_TYPES() NULL }; -const char * const bch2_fs_usage_types[] = { +static const char * const __bch2_fs_usage_types[] = { BCH_FS_USAGE_TYPES() NULL }; #undef x +static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], + unsigned nr, const char *type, unsigned idx) +{ + if (idx < nr) + prt_str(out, opts[idx]); + else + prt_printf(out, "(unknown %s %u)", type, idx); +} + +#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \ +void bch2_prt_##name(struct printbuf *out, type t) \ +{ \ + prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\ +} + +PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type); +PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); +PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); +PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); +PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); + static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, struct printbuf *err) {
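For reference, one instantiation of the bounds-checked printer macro above - PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type) - expands to roughly the following; the ARRAY_SIZE() - 1 accounts for the NULL sentinel terminating each table:

void bch2_prt_csum_type(struct printbuf *out, enum bch_csum_type t)
{
	/* prints __bch2_csum_types[t], or "(unknown csum_type N)" if out of range */
	prt_str_opt_boundscheck(out, __bch2_csum_types,
				ARRAY_SIZE(__bch2_csum_types) - 1,
				"csum_type", t);
}

diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 1ac4135cca1c..84e452835a17 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -16,18 +16,20 @@ extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; -extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; -extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char *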
const bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; extern const char * const __bch2_data_types[]; extern const char * const bch2_member_states[]; -extern const char * const bch2_jset_entry_types[]; -extern const char * const bch2_fs_usage_types[]; extern const char * const bch2_d_types[]; +void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); +void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); +void bch2_prt_data_type(struct printbuf *, enum bch_data_type); +void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); +void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); + static inline const char *bch2_d_type_str(unsigned d_type) { return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b76c16152579..0f328aba9760 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -47,20 +47,6 @@ void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) } } -static bool btree_id_is_alloc(enum btree_id id) -{ - switch (id) { - case BTREE_ID_alloc: - case BTREE_ID_backpointers: - case BTREE_ID_need_discard: - case BTREE_ID_freespace: - case BTREE_ID_bucket_gens: - return true; - default: - return false; - } -} - /* for -o reconstruct_alloc: */ static void bch2_reconstruct_alloc(struct bch_fs *c) { diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index cb501460d615..0cec0f7d9703 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -44,7 +44,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || c->opts.fsck || !c->sb.clean) + if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit) return bch2_fs_read_write_early(c); return 0; } diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index d6f81179c3a2..a98ef940b7a3 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -51,7 +51,10 @@ BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \ x(btree_subvolume_children, \ BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \ - BCH_FSCK_ERR_subvol_children_not_set) + BCH_FSCK_ERR_subvol_children_not_set) \ + x(mi_btree_bitmap, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_btree_bitmap_not_marked) #define DOWNGRADE_TABLE() diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index d7d609131030..4ca6e7b0d8aa 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -130,7 +130,7 @@ x(bucket_gens_nonzero_for_invalid_buckets, 122) \ x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ x(need_discard_freespace_key_bad, 124) \ - x(backpointer_pos_wrong, 125) \ + x(backpointer_bucket_offset_wrong, 125) \ x(backpointer_to_missing_device, 126) \ x(backpointer_to_missing_alloc, 127) \ x(backpointer_to_missing_ptr, 128) \ @@ -270,7 +270,8 @@ x(btree_ptr_v2_min_key_bad, 262) \ x(btree_root_unreadable_and_scan_found_nothing, 263) \ x(snapshot_node_missing, 264) \ - x(dup_backpointer_to_bad_csum_extent, 265) + x(dup_backpointer_to_bad_csum_extent, 265) \ + x(btree_bitmap_not_marked, 266) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index eff5ce18c69c..522a969345e5 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" 
#include "disk_groups.h" #include "opts.h" #include "replicas.h" @@ -426,3 +427,55 @@ void bch2_dev_errors_reset(struct bch_dev *ca) bch2_write_super(c); mutex_unlock(&c->sb_lock); } + +/* + * Per member "range has btree nodes" bitmap: + * + * This is so that if we ever have to run the btree node scan to repair we don't + * have to scan full devices: + */ + +bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) +{ + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) + if (!bch2_dev_btree_bitmap_marked_sectors(bch_dev_bkey_exists(c, ptr->dev), + ptr->offset, btree_sectors(c))) + return false; + return true; +} + +static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, + u64 start, unsigned sectors) +{ + struct bch_member *m = __bch2_members_v2_get_mut(mi, dev); + u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap); + + u64 end = start + sectors; + + int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6); + if (resize > 0) { + u64 new_bitmap = 0; + + for (unsigned i = 0; i < 64; i++) + if (bitmap & BIT_ULL(i)) + new_bitmap |= BIT_ULL(i >> resize); + bitmap = new_bitmap; + m->btree_bitmap_shift += resize; + } + + for (unsigned bit = start >> m->btree_bitmap_shift; + bit << m->btree_bitmap_shift < end; + bit++) + bitmap |= BIT_ULL(bit); + + m->btree_allocated_bitmap = cpu_to_le64(bitmap); +} + +void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) +{ + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) + __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); +} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index be0a94183271..b27c3e4467cf 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -3,6 +3,7 @@ #define _BCACHEFS_SB_MEMBERS_H #include "darray.h" +#include "bkey_types.h" extern char * const bch2_member_error_strs[]; @@ -220,6 +221,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) : 1, .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), .valid = bch2_member_exists(mi), + .btree_bitmap_shift = mi->btree_bitmap_shift, + .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), }; } @@ -228,4 +231,22 @@ void bch2_sb_members_from_cpu(struct bch_fs *); void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); void bch2_dev_errors_reset(struct bch_dev *); +static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors) +{ + u64 end = start + sectors; + + if (end > 64 << ca->mi.btree_bitmap_shift) + return false; + + for (unsigned bit = start >> ca->mi.btree_bitmap_shift; + bit << ca->mi.btree_bitmap_shift < end; + bit++) + if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit))) + return false; + return true; +} + +bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); +void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); + #endif /* _BCACHEFS_SB_MEMBERS_H */
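Each bit covers 1 << btree_bitmap_shift sectors, so one 64-bit word spans the whole device once the shift is large enough; the resize loop above coarsens the granularity until the new end fits. A worked sketch of which bits a range touches (illustrative values, not from a real superblock):

#include <stdio.h>

typedef unsigned long long u64;

/* Which bitmap bits cover sectors [start, start + sectors) at a
 * granularity of (1 << shift) sectors per bit? */
static void bitmap_bits_for_range(u64 start, unsigned sectors, unsigned shift)
{
	u64 end = start + sectors;

	for (unsigned bit = start >> shift;
	     ((u64) bit << shift) < end;
	     bit++)
		printf("bit %u covers sectors [%llu, %llu)\n",
		       bit, (u64) bit << shift, (u64) (bit + 1) << shift);
}

/* bitmap_bits_for_range(1000, 512, 9) prints bits 1 and 2: the range
 * 1000..1511 straddles the 512-sector chunks starting at 512 and 1024. */

diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 0e806f04f3d7..544322d5c251 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -125,6 +125,15 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances return s->parent; } +static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor) +{ + const struct snapshot_t *s = __snapshot_t(t, id); + if (!s) + return false; + + return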
test_bit(ancestor - id - 1, s->is_ancestor); +} + bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { bool ret; @@ -140,13 +149,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) while (id && id < ancestor - IS_ANCESTOR_BITMAP) id = get_ancestor_below(t, id, ancestor); - if (id && id < ancestor) { - ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor); + ret = id && id < ancestor + ? test_ancestor_bitmap(t, id, ancestor) + : id == ancestor; - EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); - } else { - ret = id == ancestor; - } + EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); out: rcu_read_unlock(); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 5eee055ee272..08ea3dbbbe97 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -700,8 +700,11 @@ retry: return -ENOMEM; sb->sb_name = kstrdup(path, GFP_KERNEL); - if (!sb->sb_name) - return -ENOMEM; + if (!sb->sb_name) { + ret = -ENOMEM; + prt_printf(&err, "error allocating memory for sb_name"); + goto err; + } #ifndef __KERNEL__ if (opt_get(*opts, direct_io) == false) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ed63018f21be..8daf80a38d60 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -288,8 +288,13 @@ static void __bch2_fs_read_only(struct bch_fs *c) if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && !test_bit(BCH_FS_emergency_ro, &c->flags)) set_bit(BCH_FS_clean_shutdown, &c->flags); + bch2_fs_journal_stop(&c->journal); + bch_info(c, "%sshutdown complete, journal seq %llu", + test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", + c->journal.seq_ondisk); + /* * After stopping journal: */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index ec784d975f66..11bcef170c2c 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -37,6 +37,8 @@ struct bch_member_cpu { u8 durability; u8 freespace_initialized; u8 valid; + u8 btree_bitmap_shift; + u64 btree_allocated_bitmap; }; #endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index c86a93a8d8fc..5be92fe3f4ea 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -17,7 +17,6 @@ #include "btree_iter.h" #include "btree_key_cache.h" #include "btree_update.h" -#include "btree_update_interior.h" #include "btree_gc.h" #include "buckets.h" #include "clock.h" @@ -26,6 +25,7 @@ #include "ec.h" #include "inode.h" #include "journal.h" +#include "journal_reclaim.h" #include "keylist.h" #include "move.h" #include "movinggc.h" @@ -139,6 +139,7 @@ do { \ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); +write_attribute(trigger_journal_flush); write_attribute(prune_cache); write_attribute(btree_wakeup); rw_attribute(btree_gc_periodic); @@ -166,7 +167,6 @@ read_attribute(btree_write_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); read_attribute(journal_debug); -read_attribute(btree_updates); read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(stripes_heap); @@ -415,9 +415,6 @@ SHOW(bch2_fs) if (attr == &sysfs_journal_debug) bch2_journal_debug_to_text(out, &c->journal); - if (attr == &sysfs_btree_updates) - bch2_btree_updates_to_text(out, c); - if (attr == &sysfs_btree_cache) bch2_btree_cache_to_text(out, c); @@ -505,7 +502,7 @@ STORE(bch2_fs) /* Debugging: */ - if (!test_bit(BCH_FS_rw, &c->flags)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) 
return -EROFS; if (attr == &sysfs_prune_cache) { @@ -538,6 +535,11 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_invalidates) bch2_do_invalidates(c); + if (attr == &sysfs_trigger_journal_flush) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_meta(&c->journal); + } + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -558,6 +560,7 @@ STORE(bch2_fs) size = ret; } #endif + bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); return size; } SYSFS_OPS(bch2_fs); @@ -639,7 +642,6 @@ SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { &sysfs_flags, &sysfs_journal_debug, - &sysfs_btree_updates, &sysfs_btree_cache, &sysfs_btree_key_cache, &sysfs_new_stripes, @@ -657,6 +659,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_gc, &sysfs_trigger_discards, &sysfs_trigger_invalidates, + &sysfs_trigger_journal_flush, &sysfs_prune_cache, &sysfs_btree_wakeup, diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index b3fe9fc57747..bfec656f94c0 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index b7e7c29278fc..5cf885b09986 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -788,6 +788,14 @@ static inline int copy_from_user_errcode(void *to, const void __user *from, unsi #endif +static inline void mod_bit(long nr, volatile unsigned long *addr, bool v) +{ + if (v) + set_bit(nr, addr); + else + clear_bit(nr, addr); +} + static inline void __set_bit_le64(size_t bit, __le64 *addr) { addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); @@ -795,7 +803,7 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr) static inline void __clear_bit_le64(size_t bit, __le64 *addr) { - addr[bit / 64] &= !cpu_to_le64(BIT_ULL(bit % 64)); + addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64)); } static inline bool test_bit_le64(size_t bit, __le64 *addr) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index dd6f566a383f..121ab890bd05 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1133,6 +1133,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, if (ret) return ret; + ret = btrfs_record_root_in_trans(trans, node->root); + if (ret) + return ret; ret = btrfs_update_delayed_inode(trans, node->root, path, node); return ret; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index beedd6ed64d3..257d044bca91 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3464,6 +3464,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root_id != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_ref generic_ref = { 0 }; + /* + * Assert that the extent buffer is not cleared due to + * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer to + * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for + * details. 
+ */ + ASSERT(btrfs_header_bytenr(buf) != 0); + btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, buf->start, buf->len, parent, btrfs_header_owner(buf)); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 61594eaf1f89..2776112dbdf8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -681,31 +681,21 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, gfp_t extra_gfp) { + const gfp_t gfp = GFP_NOFS | extra_gfp; unsigned int allocated; for (allocated = 0; allocated < nr_pages;) { unsigned int last = allocated; - allocated = alloc_pages_bulk_array(GFP_NOFS | extra_gfp, - nr_pages, page_array); - - if (allocated == nr_pages) - return 0; - - /* - * During this iteration, no page could be allocated, even - * though alloc_pages_bulk_array() falls back to alloc_page() - * if it could not bulk-allocate. So we must be out of memory. - */ - if (allocated == last) { + allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array); + if (unlikely(allocated == last)) { + /* No progress, fail and do cleanup. */ for (int i = 0; i < allocated; i++) { __free_page(page_array[i]); page_array[i] = NULL; } return -ENOMEM; } - - memalloc_retry_wait(GFP_NOFS); } return 0; } @@ -4154,7 +4144,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, * The actual zeroout of the buffer will happen later in * btree_csum_one_bio. */ - if (btrfs_is_zoned(fs_info)) { + if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); return; } @@ -4193,6 +4183,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (!was_dirty) { bool subpage = eb->fs_info->nodesize < PAGE_SIZE; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 37701531eeb1..c65fe5de4022 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2533,7 +2533,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, */ if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) - btrfs_delalloc_release_metadata(inode, len, false); + btrfs_delalloc_release_metadata(inode, len, true); /* For sanity tests. 
*/ if (btrfs_is_testing(fs_info)) @@ -4503,6 +4503,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; struct btrfs_block_rsv block_rsv; u64 root_flags; + u64 qgroup_reserved = 0; int ret; down_write(&fs_info->subvol_sem); @@ -4547,12 +4548,20 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) goto out_undead; + qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_release; } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -4611,7 +4620,9 @@ out_end_trans: ret = btrfs_end_transaction(trans); inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(root, &block_rsv); + btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_undead: if (ret) { spin_lock(&dest->root_item_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 294e31edec9d..55f3ba6a831c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -613,6 +613,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, int ret; dev_t anon_dev; u64 objectid; + u64 qgroup_reserved = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -650,13 +651,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap, trans_num_items, false); if (ret) goto out_new_inode_args; + qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(root, &block_rsv); - goto out_new_inode_args; + goto out_release_rsv; } + ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root); + if (ret) + goto out; + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; /* Tree log can't currently deal with an inode which is a new root. */ @@ -767,9 +773,11 @@ static noinline int create_subvol(struct mnt_idmap *idmap, out: trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(root, &block_rsv); - btrfs_end_transaction(trans); +out_release_rsv: + btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: @@ -791,6 +799,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_pending_snapshot *pending_snapshot; unsigned int trans_num_items; struct btrfs_trans_handle *trans; + struct btrfs_block_rsv *block_rsv; + u64 qgroup_reserved = 0; int ret; /* We do not support snapshotting right now. 
*/ @@ -827,19 +837,19 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto free_pending; } - btrfs_init_block_rsv(&pending_snapshot->block_rsv, - BTRFS_BLOCK_RSV_TEMP); + block_rsv = &pending_snapshot->block_rsv; + btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * 1 to add dir item * 1 to add dir index * 1 to update parent inode item */ trans_num_items = create_subvol_num_items(inherit) + 3; - ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, - &pending_snapshot->block_rsv, + ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv, trans_num_items, false); if (ret) goto free_pending; + qgroup_reserved = block_rsv->qgroup_rsv_reserved; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -852,6 +862,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = PTR_ERR(trans); goto fail; } + ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root); + if (ret) { + btrfs_end_transaction(trans); + goto fail; + } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->pending_snapshot = pending_snapshot; @@ -881,7 +898,9 @@ fail: if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); - btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); + btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5f90f0605b12..cf8820ce7aa2 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4495,6 +4495,8 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); + if (!sb_rdonly(fs_info->sb)) + add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); } /* diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 4bb538a372ce..7007f9e0c972 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -548,13 +548,3 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, } return ret; } - -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - u64 qgroup_to_release; - - btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release); - btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release); -} diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h index 6f929cf3bd49..8f5739e732b9 100644 --- a/fs/btrfs/root-tree.h +++ b/fs/btrfs/root-tree.h @@ -18,8 +18,6 @@ struct btrfs_trans_handle; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv); int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const struct fscrypt_str *name); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 46e8426adf4f..85f359e0e0a7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -745,14 +745,6 @@ again: h->reloc_reserved = reloc_reserved; } - /* - * Now that we have found a transaction to be a part of, convert the - * qgroup reservation from prealloc to pertrans. 
A different transaction - * can't race in and free our pertrans out from under us. - */ - if (qgroup_reserved) - btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); - got_it: if (!current->journal_info) current->journal_info = h; @@ -786,8 +778,15 @@ got_it: * not just freed. */ btrfs_end_transaction(h); - return ERR_PTR(ret); + goto reserve_fail; } + /* + * Now that we have found a transaction to be a part of, convert the + * qgroup reservation from prealloc to pertrans. A different transaction + * can't race in and free our pertrans out from under us. + */ + if (qgroup_reserved) + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); return h; @@ -1495,6 +1494,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); + btrfs_qgroup_free_meta_all_pertrans(root); spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); @@ -1519,7 +1519,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) if (ret2) return ret2; spin_lock(&fs_info->fs_roots_radix_lock); - btrfs_qgroup_free_meta_all_pertrans(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1340d77124ae..ee9caf7916fb 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -795,8 +795,10 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) ihold(inode); if (wbc->sync_mode == WB_SYNC_NONE && - ceph_inode_to_fs_client(inode)->write_congested) + ceph_inode_to_fs_client(inode)->write_congested) { + redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; + } wait_on_page_fscache(page); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 55051ad09c19..c4941ba245ac 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4783,13 +4783,13 @@ int ceph_drop_caps_for_unlink(struct inode *inode) doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); - spin_lock(&mdsc->cap_unlink_delay_lock); + spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) list_del_init(&ci->i_cap_delay_list); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_unlink_delay_list); - spin_unlock(&mdsc->cap_unlink_delay_lock); + spin_unlock(&mdsc->cap_delay_lock); /* * Fire the work immediately, because the MDS maybe diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3ab9c268a8bb..360b686c3c67 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2504,7 +2504,7 @@ static void ceph_cap_unlink_work(struct work_struct *work) struct ceph_client *cl = mdsc->fsc->client; doutc(cl, "begin\n"); - spin_lock(&mdsc->cap_unlink_delay_lock); + spin_lock(&mdsc->cap_delay_lock); while (!list_empty(&mdsc->cap_unlink_delay_list)) { struct ceph_inode_info *ci; struct inode *inode; @@ -2516,15 +2516,15 @@ static void ceph_cap_unlink_work(struct work_struct *work) inode = igrab(&ci->netfs.inode); if (inode) { - spin_unlock(&mdsc->cap_unlink_delay_lock); + spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "on %p %llx.%llx\n", inode, ceph_vinop(inode)); ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); - spin_lock(&mdsc->cap_unlink_delay_lock); + spin_lock(&mdsc->cap_delay_lock); } } - spin_unlock(&mdsc->cap_unlink_delay_lock); + spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "done\n"); } @@ -5404,7 +5404,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_LIST_HEAD(&mdsc->cap_wait_list); spin_lock_init(&mdsc->cap_delay_lock); 
INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list); - spin_lock_init(&mdsc->cap_unlink_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); mdsc->last_cap_flush_tid = 1; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 03f8ff00874f..b88e80415224 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -461,9 +461,8 @@ struct ceph_mds_client { struct delayed_work delayed_work; /* delayed work */ unsigned long last_renew_caps; /* last time we renewed our caps */ struct list_head cap_delay_list; /* caps with delayed release */ - spinlock_t cap_delay_lock; /* protects cap_delay_list */ struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */ - spinlock_t cap_unlink_delay_lock; /* protects cap_unlink_delay_list */ + spinlock_t cap_delay_lock; /* protects cap_delay_list and cap_unlink_delay_list */ struct list_head snap_flush_list; /* cap_snaps ready to flush */ spinlock_t snap_flush_lock; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b6cad106c37e..0b2da7b7e2ad 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -310,6 +310,10 @@ struct cuse_init_args { /** * cuse_process_init_reply - finish initializing CUSE channel * + * @fm: The fuse mount information containing the CUSE connection. + * @args: The arguments passed to the init reply. + * @error: The error code signifying if any error occurred during the process. + * * This function creates the character device and sets up all the * required data structures for it. Please read the comment at the * top of this file for high level overview. diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 4a6df591add6..2b0d4781f394 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1321,6 +1321,7 @@ retry: err = fuse_do_statx(inode, file, stat); if (err == -ENOSYS) { fc->no_statx = 1; + err = 0; goto retry; } } else { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a56e7bffd000..b57ce4157640 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1362,7 +1362,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, bool *exclusive) { struct inode *inode = file_inode(iocb->ki_filp); - struct fuse_file *ff = iocb->ki_filp->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); if (*exclusive) { @@ -1377,7 +1377,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, * have raced, so check it again. */ if (fuse_io_past_eof(iocb, from) || - fuse_file_uncached_io_start(inode, ff, NULL) != 0) { + fuse_inode_uncached_io_start(fi, NULL) != 0) { inode_unlock_shared(inode); inode_lock(inode); *exclusive = true; @@ -1388,13 +1388,13 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) { struct inode *inode = file_inode(iocb->ki_filp); - struct fuse_file *ff = iocb->ki_filp->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); if (exclusive) { inode_unlock(inode); } else { /* Allow opens in caching mode after last parallel dio end */ - fuse_file_uncached_io_end(inode, ff); + fuse_inode_uncached_io_end(fi); inode_unlock_shared(inode); } } @@ -2574,8 +2574,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) * First mmap of direct_io file enters caching inode io mode. * Also waits for parallel dio writers to go into serial mode * (exclusive instead of shared lock). + * After first mmap, the inode stays in caching io mode until + * the direct_io file release. 
*/ - rc = fuse_file_cached_io_start(inode, ff); + rc = fuse_file_cached_io_open(inode, ff); if (rc) return rc; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b24084b60864..f23919610313 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1394,9 +1394,10 @@ int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); /* iomode.c */ -int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff); -int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb); -void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff); +int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff); +int fuse_inode_uncached_io_start(struct fuse_inode *fi, + struct fuse_backing *fb); +void fuse_inode_uncached_io_end(struct fuse_inode *fi); int fuse_file_io_open(struct file *file, struct inode *inode); void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3a5d88878335..99e44ea7d875 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -175,6 +175,7 @@ static void fuse_evict_inode(struct inode *inode) } } if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { + WARN_ON(fi->iocachectr != 0); WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); } diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c index c653ddcf0578..c99e285f3183 100644 --- a/fs/fuse/iomode.c +++ b/fs/fuse/iomode.c @@ -21,12 +21,13 @@ static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi) } /* - * Start cached io mode. + * Called on cached file open() and on first mmap() of direct_io file. + * Takes cached_io inode mode reference to be dropped on file release. * * Blocks new parallel dio writes and waits for the in-progress parallel dio * writes to complete. 
*/ -int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) +int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -67,10 +68,9 @@ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) return 0; } -static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) +static void fuse_file_cached_io_release(struct fuse_file *ff, + struct fuse_inode *fi) { - struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fi->lock); WARN_ON(fi->iocachectr <= 0); WARN_ON(ff->iomode != IOM_CACHED); @@ -82,16 +82,15 @@ } /* Start strictly uncached io mode where cache access is not allowed */ -int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb) +int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb) { - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_backing *oldfb; int err = 0; spin_lock(&fi->lock); /* deny conflicting backing files on same fuse inode */ oldfb = fuse_inode_backing(fi); - if (oldfb && oldfb != fb) { + if (fb && oldfb && oldfb != fb) { err = -EBUSY; goto unlock; } @@ -99,12 +98,10 @@ int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struc err = -ETXTBSY; goto unlock; } - WARN_ON(ff->iomode != IOM_NONE); fi->iocachectr--; - ff->iomode = IOM_UNCACHED; /* fuse inode holds a single refcount of backing file */ - if (!oldfb) { + if (fb && !oldfb) { oldfb = fuse_inode_backing_set(fi, fb); WARN_ON_ONCE(oldfb != NULL); } else { @@ -115,15 +112,29 @@ unlock: return err; } -void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) +/* Takes uncached_io inode mode reference to be dropped on file release */ +static int fuse_file_uncached_io_open(struct inode *inode, + struct fuse_file *ff, + struct fuse_backing *fb) { struct fuse_inode *fi = get_fuse_inode(inode); + int err; + + err = fuse_inode_uncached_io_start(fi, fb); + if (err) + return err; + + WARN_ON(ff->iomode != IOM_NONE); + ff->iomode = IOM_UNCACHED; + return 0; +} + +void fuse_inode_uncached_io_end(struct fuse_inode *fi) +{ struct fuse_backing *oldfb = NULL; spin_lock(&fi->lock); WARN_ON(fi->iocachectr >= 0); - WARN_ON(ff->iomode != IOM_UNCACHED); - ff->iomode = IOM_NONE; fi->iocachectr++; if (!fi->iocachectr) { wake_up(&fi->direct_io_waitq); @@ -134,6 +145,15 @@ void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) fuse_backing_put(oldfb); } +/* Drop uncached_io reference from passthrough open */ +static void fuse_file_uncached_io_release(struct fuse_file *ff, + struct fuse_inode *fi) +{ + WARN_ON(ff->iomode != IOM_UNCACHED); + ff->iomode = IOM_NONE; + fuse_inode_uncached_io_end(fi); +} +
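fi->iocachectr is a signed mode counter: positive while cached-io references are held, negative while uncached (parallel dio or passthrough) references are held, and the two modes exclude each other. A minimal sketch of that convention (struct and function names are illustrative, not fuse's API):

#include <linux/errno.h>
#include <linux/spinlock.h>

/* Sketch: > 0 means cached references, < 0 means uncached references. */
struct io_mode {
	spinlock_t lock;
	int ctr;		/* 0 = idle, > 0 = cached, < 0 = uncached */
};

static int io_mode_get_uncached(struct io_mode *m)
{
	int err = 0;

	spin_lock(&m->lock);
	if (m->ctr > 0)
		err = -ETXTBSY;	/* cached users present, refuse */
	else
		m->ctr--;	/* one more uncached reference */
	spin_unlock(&m->lock);
	return err;
}

static void io_mode_put_uncached(struct io_mode *m)
{
	spin_lock(&m->lock);
	WARN_ON(m->ctr >= 0);
	m->ctr++;		/* hitting 0 lets cached opens proceed again */
	spin_unlock(&m->lock);
}

/* * Open flags that are allowed in combination with FOPEN_PASSTHROUGH. 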
* A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write @@ -163,7 +183,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file) return PTR_ERR(fb); /* First passthrough file open denies caching inode io mode */ - err = fuse_file_uncached_io_start(inode, ff, fb); + err = fuse_file_uncached_io_open(inode, ff, fb); if (!err) return 0; @@ -216,7 +236,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode) if (ff->open_flags & FOPEN_PASSTHROUGH) err = fuse_file_passthrough_open(inode, file); else - err = fuse_file_cached_io_start(inode, ff); + err = fuse_file_cached_io_open(inode, ff); if (err) goto fail; @@ -236,8 +256,10 @@ fail: /* No more pending io and no new io possible to inode via open/mmapped file */ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) { + struct fuse_inode *fi = get_fuse_inode(inode); + /* - * Last parallel dio close allows caching inode io mode. + * Last passthrough file close allows caching inode io mode. * Last caching file close exits caching inode io mode. */ switch (ff->iomode) { @@ -245,10 +267,10 @@ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) /* Nothing to do */ break; case IOM_UNCACHED: - fuse_file_uncached_io_end(inode, ff); + fuse_file_uncached_io_release(ff, fi); break; case IOM_CACHED: - fuse_file_cached_io_end(inode, ff); + fuse_file_cached_io_release(ff, fi); break; } } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e9df2f87072c..8502ef68459b 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -636,11 +636,18 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) * each file a separate locking class. Let's differentiate on * whether the file has mmap or not for now. * - * Both paths of the branch look the same. They're supposed to + * For similar reasons, writable and readonly files are given different + * lockdep keys, because the writable file /sys/power/resume may call vfs + * lookup helpers for arbitrary paths and readonly files can be read by + * overlayfs from vfs helpers when sysfs is a lower layer of overlayfs. + * + * All three cases look the same. They're supposed to * look that way and give @of->mutex different static lockdep keys. */ if (has_mmap) mutex_init(&of->mutex); + else if (file->f_mode & FMODE_WRITE) + mutex_init(&of->mutex); else mutex_init(&of->mutex); 
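The three mutex_init() calls above look identical, but mutex_init() is a macro that plants a distinct static lock_class_key at each call site, which is what gives the mmap, writable, and readonly cases separate lockdep classes. Simplified from include/linux/mutex.h:

#define mutex_init(mutex)					\
do {								\
	static struct lock_class_key __key;			\
								\
	__mutex_init((mutex), #mutex, &__key);			\
} while (0)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index fac938f563ad..1955481832e0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3490,11 +3490,13 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct dentry *dentry, const u32 *bmval, int ignore_crossmnt) { + DECLARE_BITMAP(attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)); struct nfsd4_fattr_args args; struct svc_fh *tempfh = NULL; int starting_len = xdr->buf->len; __be32 *attrlen_p, status; int attrlen_offset; + u32 attrmask[3]; int err; struct nfsd4_compoundres *resp = rqstp->rq_resp; u32 minorversion = resp->cstate.minorversion; @@ -3502,10 +3504,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, .mnt = exp->ex_path.mnt, .dentry = dentry, }; - union { - u32 attrmask[3]; - unsigned long mask[2]; - } u; unsigned long bit; bool file_modified = false; u64 size = 0; @@ -3521,20 +3519,19 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, /* * Make a local copy of the attribute bitmap that can be modified. 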
*/ - memset(&u, 0, sizeof(u)); - u.attrmask[0] = bmval[0]; - u.attrmask[1] = bmval[1]; - u.attrmask[2] = bmval[2]; + attrmask[0] = bmval[0]; + attrmask[1] = bmval[1]; + attrmask[2] = bmval[2]; args.rdattr_err = 0; if (exp->ex_fslocs.migrated) { - status = fattr_handle_absent_fs(&u.attrmask[0], &u.attrmask[1], - &u.attrmask[2], &args.rdattr_err); + status = fattr_handle_absent_fs(&attrmask[0], &attrmask[1], + &attrmask[2], &args.rdattr_err); if (status) goto out; } args.size = 0; - if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { + if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry), &file_modified, &size); if (status) @@ -3553,16 +3550,16 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (!(args.stat.result_mask & STATX_BTIME)) /* underlying FS does not offer btime so we can't share it */ - u.attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE; - if ((u.attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | + attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE; + if ((attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) || - (u.attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | + (attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL))) { err = vfs_statfs(&path, &args.statfs); if (err) goto out_nfserr; } - if ((u.attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && + if ((attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); status = nfserr_jukebox; @@ -3577,10 +3574,10 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, args.fhp = fhp; args.acl = NULL; - if (u.attrmask[0] & FATTR4_WORD0_ACL) { + if (attrmask[0] & FATTR4_WORD0_ACL) { err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl); if (err == -EOPNOTSUPP) - u.attrmask[0] &= ~FATTR4_WORD0_ACL; + attrmask[0] &= ~FATTR4_WORD0_ACL; else if (err == -EINVAL) { status = nfserr_attrnotsupp; goto out; @@ -3592,17 +3589,17 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, #ifdef CONFIG_NFSD_V4_SECURITY_LABEL args.context = NULL; - if ((u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || - u.attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { + if ((attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || + attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { if (exp->ex_flags & NFSEXP_SECURITY_LABEL) err = security_inode_getsecctx(d_inode(dentry), &args.context, &args.contextlen); else err = -EOPNOTSUPP; args.contextsupport = (err == 0); - if (u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) { + if (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) { if (err == -EOPNOTSUPP) - u.attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; else if (err) goto out_nfserr; } @@ -3610,8 +3607,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ /* attrmask */ - status = nfsd4_encode_bitmap4(xdr, u.attrmask[0], - u.attrmask[1], u.attrmask[2]); + status = nfsd4_encode_bitmap4(xdr, attrmask[0], attrmask[1], + attrmask[2]); if (status) goto out; @@ -3620,7 +3617,9 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, attrlen_p = xdr_reserve_space(xdr, XDR_UNIT); if (!attrlen_p) goto out_resource; - for_each_set_bit(bit, (const unsigned long *)&u.mask, + bitmap_from_arr32(attr_bitmap, attrmask, + ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)); + 
for_each_set_bit(bit, attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) { status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args); if (status != nfs_ok) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index bc846b904b68..aee40db7a036 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -240,7 +240,7 @@ nilfs_filetype_table[NILFS_FT_MAX] = { #define S_SHIFT 12 static unsigned char -nilfs_type_by_mode[S_IFMT >> S_SHIFT] = { +nilfs_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR, [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV, diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 902b326e1e56..87dcaae32ff8 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -62,12 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) break; dst += ret; } - if (ret >= 0 && boot_command_line[0]) { - ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", - boot_command_line); - if (ret > 0) - dst += ret; - } + } + if (cmdline_has_extra_options() && ret >= 0 && boot_command_line[0]) { + ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", + boot_command_line); + if (ret > 0) + dst += ret; } out: kfree(key); diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 13a9d7acf8f8..0ff2491c311d 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -433,8 +433,8 @@ smb2_close_cached_fid(struct kref *ref) if (cfid->is_open) { rc = SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, cfid->fid.volatile_fid); - if (rc != -EBUSY && rc != -EAGAIN) - atomic_dec(&cfid->tcon->num_remote_opens); + if (rc) /* should we retry on -EBUSY or -EAGAIN? */ + cifs_dbg(VFS, "close cached dir rc %d\n", rc); } free_cached_dir(cfid); diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index f6a302205f89..d6669ce4ae87 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1077,6 +1077,7 @@ struct cifs_ses { and after mount option parsing we fill it */ char *domainName; char *password; + char *password2; /* When key rotation used, new password may be set before it expires */ char workstation_name[CIFS_MAX_WORKSTATION_LEN]; struct session_key auth_key; struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 85679ae106fd..4e35970681bf 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2183,6 +2183,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) } ++delim; + /* BB consider adding support for password2 (Key Rotation) for multiuser in future */ ctx->password = kstrndup(delim, len, GFP_KERNEL); if (!ctx->password) { cifs_dbg(FYI, "Unable to allocate %zd bytes for password\n", @@ -2206,6 +2207,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) kfree(ctx->username); ctx->username = NULL; kfree_sensitive(ctx->password); + /* no need to free ctx->password2 since not allocated in this path */ ctx->password = NULL; goto out_key_put; } @@ -2317,6 +2319,12 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) if (!ses->password) goto get_ses_fail; } + /* ctx->password freed at unmount */ + if (ctx->password2) { + ses->password2 = kstrdup(ctx->password2, GFP_KERNEL); + if (!ses->password2) + goto get_ses_fail; + } if (ctx->domainname) { ses->domainName = kstrdup(ctx->domainname, GFP_KERNEL); if (!ses->domainName)
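With the parsing and session plumbing added here, a mount can carry both the current and the next credential, so a server-side password rotation does not break reconnects. Illustrative usage (hypothetical values):

mount -t cifs //server/share /mnt -o username=user,password=oldpass,password2=newpass

diff --git a/fs/smb/client/fs_context.c 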
b/fs/smb/client/fs_context.c index b7bfe705b2c4..6c727d8c31e8 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -162,6 +162,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_string("username", Opt_user), fsparam_string("pass", Opt_pass), fsparam_string("password", Opt_pass), + fsparam_string("password2", Opt_pass2), fsparam_string("ip", Opt_ip), fsparam_string("addr", Opt_ip), fsparam_string("domain", Opt_domain), @@ -345,6 +346,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx new_ctx->nodename = NULL; new_ctx->username = NULL; new_ctx->password = NULL; + new_ctx->password2 = NULL; new_ctx->server_hostname = NULL; new_ctx->domainname = NULL; new_ctx->UNC = NULL; @@ -357,6 +359,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx DUP_CTX_STR(prepath); DUP_CTX_STR(username); DUP_CTX_STR(password); + DUP_CTX_STR(password2); DUP_CTX_STR(server_hostname); DUP_CTX_STR(UNC); DUP_CTX_STR(source); @@ -905,6 +908,8 @@ static int smb3_reconfigure(struct fs_context *fc) else { kfree_sensitive(ses->password); ses->password = kstrdup(ctx->password, GFP_KERNEL); + kfree_sensitive(ses->password2); + ses->password2 = kstrdup(ctx->password2, GFP_KERNEL); } STEAL_STRING(cifs_sb, ctx, domainname); STEAL_STRING(cifs_sb, ctx, nodename); @@ -1305,6 +1310,18 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; } break; + case Opt_pass2: + kfree_sensitive(ctx->password2); + ctx->password2 = NULL; + if (strlen(param->string) == 0) + break; + + ctx->password2 = kstrdup(param->string, GFP_KERNEL); + if (ctx->password2 == NULL) { + cifs_errorf(fc, "OOM when copying password2 string\n"); + goto cifs_parse_mount_err; + } + break; case Opt_ip: if (strlen(param->string) == 0) { ctx->got_ip = false; @@ -1608,6 +1625,8 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_parse_mount_err: kfree_sensitive(ctx->password); ctx->password = NULL; + kfree_sensitive(ctx->password2); + ctx->password2 = NULL; return -EINVAL; } @@ -1713,6 +1732,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) ctx->username = NULL; kfree_sensitive(ctx->password); ctx->password = NULL; + kfree_sensitive(ctx->password2); + ctx->password2 = NULL; kfree(ctx->server_hostname); ctx->server_hostname = NULL; kfree(ctx->UNC); diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 8a35645e0b65..a947bddeba27 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -145,6 +145,7 @@ enum cifs_param { Opt_source, Opt_user, Opt_pass, + Opt_pass2, Opt_ip, Opt_domain, Opt_srcaddr, @@ -177,6 +178,7 @@ struct smb3_fs_context { char *username; char *password; + char *password2; char *domainname; char *source; char *server_hostname; diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 91b07ef9e25c..60afab5c83d4 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1105,7 +1105,8 @@ static int cifs_get_fattr(struct cifs_open_info_data *data, } else { cifs_open_info_to_fattr(fattr, data, sb); } - if (!rc && fattr->cf_flags & CIFS_FATTR_DELETE_PENDING) + if (!rc && *inode && + (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING)) cifs_mark_open_handles_for_deleted_file(*inode, full_path); break; case -EREMOTE: diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 33ac4f8f5050..7d15a1969b81 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -98,6 +98,7 @@ sesInfoFree(struct cifs_ses *buf_to_free) 
kfree(buf_to_free->serverDomain); kfree(buf_to_free->serverNOS); kfree_sensitive(buf_to_free->password); + kfree_sensitive(buf_to_free->password2); kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); kfree_sensitive(buf_to_free->auth_key.response); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index b156eefa75d7..78c94d0350fe 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4964,68 +4964,84 @@ static int smb2_next_header(struct TCP_Server_Info *server, char *buf, return 0; } -int cifs_sfu_make_node(unsigned int xid, struct inode *inode, - struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, umode_t mode, dev_t dev) +static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) { - struct cifs_open_info_data buf = {}; struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; struct cifs_io_parms io_parms = {}; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_fid fid; unsigned int bytes_written; - struct win_dev *pdev; + struct win_dev pdev = {}; struct kvec iov[2]; __u32 oplock = server->oplocks ? REQ_OPLOCK : 0; int rc; - if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode)) + switch (mode & S_IFMT) { + case S_IFCHR: + strscpy(pdev.type, "IntxCHR"); + pdev.major = cpu_to_le64(MAJOR(dev)); + pdev.minor = cpu_to_le64(MINOR(dev)); + break; + case S_IFBLK: + strscpy(pdev.type, "IntxBLK"); + pdev.major = cpu_to_le64(MAJOR(dev)); + pdev.minor = cpu_to_le64(MINOR(dev)); + break; + case S_IFIFO: + strscpy(pdev.type, "LnxFIFO"); + break; + default: return -EPERM; + } - oparms = (struct cifs_open_parms) { - .tcon = tcon, - .cifs_sb = cifs_sb, - .desired_access = GENERIC_WRITE, - .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | - CREATE_OPTION_SPECIAL), - .disposition = FILE_CREATE, - .path = full_path, - .fid = &fid, - }; + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, GENERIC_WRITE, + FILE_CREATE, CREATE_NOT_DIR | + CREATE_OPTION_SPECIAL, ACL_NO_MODE); + oparms.fid = &fid; - rc = server->ops->open(xid, &oparms, &oplock, &buf); + rc = server->ops->open(xid, &oparms, &oplock, NULL); if (rc) return rc; - /* - * BB Do not bother to decode buf since no local inode yet to put - * timestamps in, but we can reuse it safely. 
- */ - pdev = (struct win_dev *)&buf.fi; io_parms.pid = current->tgid; io_parms.tcon = tcon; - io_parms.length = sizeof(*pdev); - iov[1].iov_base = pdev; - iov[1].iov_len = sizeof(*pdev); - if (S_ISCHR(mode)) { - memcpy(pdev->type, "IntxCHR", 8); - pdev->major = cpu_to_le64(MAJOR(dev)); - pdev->minor = cpu_to_le64(MINOR(dev)); - } else if (S_ISBLK(mode)) { - memcpy(pdev->type, "IntxBLK", 8); - pdev->major = cpu_to_le64(MAJOR(dev)); - pdev->minor = cpu_to_le64(MINOR(dev)); - } else if (S_ISFIFO(mode)) { - memcpy(pdev->type, "LnxFIFO", 8); - } + io_parms.length = sizeof(pdev); + iov[1].iov_base = &pdev; + iov[1].iov_len = sizeof(pdev); rc = server->ops->sync_write(xid, &fid, &io_parms, &bytes_written, iov, 1); server->ops->close(xid, tcon, &fid); - d_drop(dentry); - /* FIXME: add code here to set EAs */ - cifs_free_open_info(&buf); + return rc; +} + +int cifs_sfu_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) +{ + struct inode *new = NULL; + int rc; + + rc = __cifs_sfu_make_node(xid, inode, dentry, tcon, + full_path, mode, dev); + if (rc) + return rc; + + if (tcon->posix_extensions) { + rc = smb311_posix_get_inode_info(&new, full_path, NULL, + inode->i_sb, xid); + } else if (tcon->unix_ext) { + rc = cifs_get_inode_info_unix(&new, full_path, + inode->i_sb, xid); + } else { + rc = cifs_get_inode_info(&new, full_path, NULL, + inode->i_sb, xid, NULL); + } + if (!rc) + d_instantiate(dentry, new); return rc; } diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index c0c4933af5fc..86c647a947cc 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -367,6 +367,17 @@ again: } rc = cifs_setup_session(0, ses, server, nls_codepage); + if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) { + /* + * Try alternate password for next reconnect (key rotation + * could be enabled on the server e.g.) 
if an alternate + * password is available and the current password is expired, + * but do not swap on non pwd related errors like host down + */ + if (ses->password2) + swap(ses->password2, ses->password); + } + if ((rc == -EACCES) && !tcon->retry) { mutex_unlock(&ses->session_mutex); rc = -EHOSTDOWN; diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index aa3411354e66..16bd693d0b3a 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -48,6 +48,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, gid_t i_gid; int err; + inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); + if (inode->i_ino == 0) + return -EINVAL; + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid); if (err) return err; @@ -58,7 +62,6 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); - inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0); inode_set_atime(inode, inode_get_mtime_sec(inode), 0); inode_set_ctime(inode, inode_get_mtime_sec(inode), 0); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 6b7652fb8050..7cd64021d453 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -463,6 +463,8 @@ struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, kn = kernfs_find_and_get(kobj->sd, attr->name); if (kn) kernfs_break_active_protection(kn); + else + kobject_put(kobj); return kn; } EXPORT_SYMBOL_GPL(sysfs_break_active_protection); diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index dc067eeb6387..894c6ca1e500 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -336,6 +336,7 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode, /** * lookup_file - look up a file in the tracefs filesystem + * @parent_ei: Pointer to the eventfs_inode that represents parent of the file * @dentry: the dentry to look up * @mode: the permission that the file should have. * @attr: saved attributes changed by user @@ -389,6 +390,7 @@ static struct dentry *lookup_file(struct eventfs_inode *parent_ei, /** * lookup_dir_entry - look up a dir in the tracefs filesystem * @dentry: the directory to look up + * @pei: Pointer to the parent eventfs_inode if available * @ei: the eventfs_inode that represents the directory to create * * This function will look up a dentry for a directory represented by @@ -478,16 +480,20 @@ void eventfs_d_release(struct dentry *dentry) /** * lookup_file_dentry - create a dentry for a file of an eventfs_inode + * @dentry: The parent dentry under which the new file's dentry will be created * @ei: the eventfs_inode that the file will be created under * @idx: the index into the entry_attrs[] of the @ei - * @parent: The parent dentry of the created file. - * @name: The name of the file to create * @mode: The mode of the file. * @data: The data to use to set the inode of the file with on open() * @fops: The fops of the file to be created. * - * Create a dentry for a file of an eventfs_inode @ei and place it into the - * address located at @e_dentry. + * This function creates a dentry for a file associated with an + * eventfs_inode @ei. It uses the entry attributes specified by @idx, + * if available. The file will have the specified @mode and its inode will be + * set up with @data upon open. The file operations will be set to @fops. + * + * Return: Returns a pointer to the newly created file's dentry or an error + * pointer. 
*/ static struct dentry * lookup_file_dentry(struct dentry *dentry, diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index c6a124e8d565..964fa7f24003 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1048,7 +1048,7 @@ static int zonefs_init_zgroup(struct super_block *sb, zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", zonefs_zgroup_name(ztype), zgroup->g_nr_zones, - zgroup->g_nr_zones > 1 ? "s" : ""); + str_plural(zgroup->g_nr_zones)); return 0; }
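
A few of the changes above repay a closer look; the sketches that follow are user-space C approximations of the mechanics involved, not the kernel code itself. First, the nfsd4_encode_fattr4() hunk replaces a cast of the u32 attrmask words to const unsigned long * with an explicit bitmap_from_arr32() conversion. The cast is only correct when 32-bit mask words happen to line up with unsigned long words; on a 64-bit big-endian machine each pair of u32s lands in the wrong halves of the long, so the iteration tests the wrong bits. Below, from_arr32() is my reconstruction of what the kernel helper does, and the demo mask values are arbitrary:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Pack an array of u32 mask words into an unsigned long bitmap
 * explicitly, so the result is correct on any word size or byte
 * order -- the job bitmap_from_arr32() does in the kernel. */
static void from_arr32(unsigned long *bitmap, const uint32_t *buf,
		       unsigned int nbits)
{
	unsigned int per_long = sizeof(unsigned long) / 4;
	unsigned int i, words = (nbits + 31) / 32;

	memset(bitmap, 0,
	       (nbits + BITS_PER_LONG - 1) / BITS_PER_LONG * sizeof(long));
	for (i = 0; i < words; i++)
		bitmap[i / per_long] |=
			(unsigned long)buf[i] << (32 * (i % per_long));
}

int main(void)
{
	uint32_t attrmask[3] = { 0x9, 0x0, 0x4 };  /* arbitrary demo bits */
	unsigned long bitmap[(96 + BITS_PER_LONG - 1) / BITS_PER_LONG];
	unsigned int bit;

	from_arr32(bitmap, attrmask, 96);
	for (bit = 0; bit < 96; bit++)
		if (bitmap[bit / BITS_PER_LONG] & (1UL << (bit % BITS_PER_LONG)))
			printf("bit %u set\n", bit);  /* 0, 3 and 66 */
	return 0;
}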
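
The nilfs2 change is a textbook off-by-one. nilfs_type_by_mode[] is indexed by the four-bit file-type nibble of i_mode, which on a corrupted image can be any value from 0 to 15, yet the array was sized S_IFMT >> S_SHIFT, i.e. 15 entries, so a bogus type nibble of 0xF read one element past the end. The arithmetic, checked in user space:

#include <stdio.h>
#include <sys/stat.h>

#define S_SHIFT 12

int main(void)
{
	/* The type nibble of i_mode can be anything 0..15 on a bad image. */
	unsigned int max_nibble = S_IFMT >> S_SHIFT;        /* 0xF == 15 */

	printf("old array size: %u (valid indices 0..%u)\n",
	       (unsigned int)(S_IFMT >> S_SHIFT), max_nibble - 1);
	printf("fixed array size: %u (valid indices 0..%u)\n",
	       (unsigned int)((S_IFMT >> S_SHIFT) + 1), max_nibble);
	return 0;
}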
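
The cifs password2 series threads a second credential through the whole option lifecycle: parsed as the password2= mount option, duplicated into the session, wiped with kfree_sensitive() on the teardown and error paths, and swapped into place in smb2pdu.c when session setup fails with an authentication or key-expiry error. The sketch below models just that swap-and-retry step; struct session and setup_session() are illustrative stand-ins rather than the kernel's types:

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct session {
	const char *password;   /* credential tried on (re)connect */
	const char *password2;  /* alternate, e.g. after key rotation */
};

/* Pretend the server only accepts the rotated secret. */
static int setup_session(struct session *s)
{
	return strcmp(s->password, "new-secret") == 0 ? 0 : -EKEYEXPIRED;
}

static int reconnect(struct session *s)
{
	int rc = setup_session(s);

	/*
	 * Swap only on auth-class failures; swapping on unrelated
	 * errors such as host-down would churn credentials for nothing.
	 */
	if ((rc == -EACCES || rc == -EKEYEXPIRED || rc == -EKEYREVOKED) &&
	    s->password2) {
		const char *tmp = s->password;

		s->password = s->password2;
		s->password2 = tmp;
	}
	return rc;
}

int main(void)
{
	struct session s = { "old-secret", "new-secret" };

	if (reconnect(&s))          /* fails; credentials are swapped */
		printf("auth failed, alternate password armed\n");
	if (!reconnect(&s))         /* retry succeeds with the rotated key */
		printf("session established\n");
	return 0;
}

Swapping rather than overwriting is deliberate: if the server has not rotated yet and the alternate credential also fails, the next auth failure simply swaps the original back.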
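
The cifs_sfu_make_node() rework stops scribbling the SFU payload into a borrowed open-info buffer and instead builds it in a zero-initialized struct win_dev on the stack, selecting the tag with a switch on the S_IFMT bits and bounding the copy with strscpy(); on success it now instantiates the dentry from freshly fetched inode metadata instead of calling d_drop(). The payload itself is a small tagged blob. The layout below mirrors the fields the hunk touches; the kernel stores major/minor little-endian via cpu_to_le64(), which this host-order demo omits, and the device numbers are made up:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct win_dev {
	char type[8];          /* "IntxCHR", "IntxBLK" or "LnxFIFO" */
	uint64_t major;
	uint64_t minor;
} __attribute__((packed));

int main(void)
{
	struct win_dev pdev = { 0 };   /* zero-filled, as `= {}` does */

	/* Character device: tag plus major/minor (made-up numbers). */
	strncpy(pdev.type, "IntxCHR", sizeof(pdev.type));
	pdev.major = 4;
	pdev.minor = 64;

	printf("payload: %zu bytes, tag \"%.7s\", dev %llu:%llu\n",
	       sizeof(pdev), pdev.type,
	       (unsigned long long)pdev.major,
	       (unsigned long long)pdev.minor);
	return 0;
}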
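
Finally, the one-line sysfs change balances a reference count: sysfs_break_active_protection() pins the kobject with kobject_get() just above the hunk's context so the object survives while active protection is broken, and sysfs_unbreak_active_protection() drops that reference later. When kernfs_find_and_get() fails, though, the caller never reaches the unbreak step, so before the fix the reference leaked. A toy model of the invariant, with illustrative types:

#include <stdio.h>

struct object { int refcount; };

static void get(struct object *o) { o->refcount++; }
static void put(struct object *o) { o->refcount--; }

/*
 * A reference is taken up front so the object survives while
 * protection is broken. On success the matching unbreak call drops
 * it later; on a failed lookup nobody will ever call unbreak, so
 * the failure path must drop it itself.
 */
static int break_protection(struct object *o, int lookup_ok)
{
	get(o);
	if (!lookup_ok) {
		put(o);   /* the fix: balance the get when the lookup fails */
		return 0;
	}
	return 1;
}

static void unbreak_protection(struct object *o)
{
	put(o);
}

int main(void)
{
	struct object o = { .refcount = 1 };

	if (break_protection(&o, 1))
		unbreak_protection(&o);
	break_protection(&o, 0);   /* failure path no longer leaks */
	printf("refcount = %d (still balanced)\n", o.refcount);
	return 0;
}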