diff options
Diffstat (limited to 'fs')
209 files changed, 4186 insertions, 1353 deletions
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 698c43dd5dc8..f28bc763847a 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -202,7 +202,7 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) return inode->i_sb->s_fs_info; } -static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry) +static inline struct v9fs_session_info *v9fs_dentry2v9ses(const struct dentry *dentry) { return dentry->d_sb->s_fs_info; } diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 01338d4c2d9e..5061f192eafd 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -61,7 +61,7 @@ static void v9fs_dentry_release(struct dentry *dentry) p9_fid_put(hlist_entry(p, struct p9_fid, dlist)); } -static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) { struct p9_fid *fid; struct inode *inode; @@ -99,14 +99,36 @@ out_valid: return 1; } +static int v9fs_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) +{ + return __v9fs_lookup_revalidate(dentry, flags); +} + +static bool v9fs_dentry_unalias_trylock(const struct dentry *dentry) +{ + struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry); + return down_write_trylock(&v9ses->rename_sem); +} + +static void v9fs_dentry_unalias_unlock(const struct dentry *dentry) +{ + struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry); + up_write(&v9ses->rename_sem); +} + const struct dentry_operations v9fs_cached_dentry_operations = { .d_revalidate = v9fs_lookup_revalidate, - .d_weak_revalidate = v9fs_lookup_revalidate, + .d_weak_revalidate = __v9fs_lookup_revalidate, .d_delete = v9fs_cached_dentry_delete, .d_release = v9fs_dentry_release, + .d_unalias_trylock = v9fs_dentry_unalias_trylock, + .d_unalias_unlock = v9fs_dentry_unalias_unlock, }; const struct dentry_operations v9fs_dentry_operations = { .d_delete = always_delete_dentry, .d_release = v9fs_dentry_release, + .d_unalias_trylock = v9fs_dentry_unalias_trylock, + .d_unalias_unlock = v9fs_dentry_unalias_unlock, }; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index a843c36fc471..02cbf38e1a77 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -23,7 +23,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); static int afs_dir_open(struct inode *inode, struct file *file); static int afs_readdir(struct file *file, struct dir_context *ctx); -static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); +static int afs_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); static void afs_d_iput(struct dentry *dentry, struct inode *inode); static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, @@ -597,19 +598,19 @@ static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, * Do a lookup of a single name in a directory * - just returns the FID the dentry name maps to if found */ -static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, +static int afs_do_lookup_one(struct inode *dir, const struct qstr *name, struct afs_fid *fid, afs_dataversion_t *_dir_version) { struct afs_super_info *as = dir->i_sb->s_fs_info; struct afs_lookup_one_cookie cookie = { .ctx.actor = afs_lookup_one_filldir, - .name = dentry->d_name, + .name = *name, .fid.vid = as->volume->vid }; int ret; - _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); + _enter("{%lu},{%.*s},", dir->i_ino, name->len, name->name); /* search the directory */ ret = afs_dir_iterate(dir, &cookie.ctx, NULL, _dir_version); @@ -1023,21 +1024,12 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, /* * Check the validity of a dentry under RCU conditions. */ -static int afs_d_revalidate_rcu(struct dentry *dentry) +static int afs_d_revalidate_rcu(struct afs_vnode *dvnode, struct dentry *dentry) { - struct afs_vnode *dvnode; - struct dentry *parent; - struct inode *dir; long dir_version, de_version; _enter("%p", dentry); - /* Check the parent directory is still valid first. */ - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - dvnode = AFS_FS_I(dir); if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) return -ECHILD; @@ -1065,11 +1057,11 @@ static int afs_d_revalidate_rcu(struct dentry *dentry) * - NOTE! the hit can be a negative hit too, so we can't assume we have an * inode */ -static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) +static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - struct afs_vnode *vnode, *dir; + struct afs_vnode *vnode, *dir = AFS_FS_I(parent_dir); struct afs_fid fid; - struct dentry *parent; struct inode *inode; struct key *key; afs_dataversion_t dir_version, invalid_before; @@ -1077,7 +1069,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) int ret; if (flags & LOOKUP_RCU) - return afs_d_revalidate_rcu(dentry); + return afs_d_revalidate_rcu(dir, dentry); if (d_really_is_positive(dentry)) { vnode = AFS_FS_I(d_inode(dentry)); @@ -1092,14 +1084,9 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) if (IS_ERR(key)) key = NULL; - /* Hold the parent dentry so we can peer at it */ - parent = dget_parent(dentry); - dir = AFS_FS_I(d_inode(parent)); - /* validate the parent directory */ ret = afs_validate(dir, key); if (ret == -ERESTARTSYS) { - dput(parent); key_put(key); return ret; } @@ -1127,7 +1114,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, &dir_version); + ret = afs_do_lookup_one(&dir->netfs.inode, name, &fid, &dir_version); switch (ret) { case 0: /* the filename maps to something */ @@ -1171,22 +1158,19 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) goto out_valid; default: - _debug("failed to iterate dir %pd: %d", - parent, ret); + _debug("failed to iterate parent %pd2: %d", dentry, ret); goto not_found; } out_valid: dentry->d_fsdata = (void *)(unsigned long)dir_version; out_valid_noupdate: - dput(parent); key_put(key); _leave(" = 1 [valid]"); return 1; not_found: _debug("dropping dentry %pd2", dentry); - dput(parent); key_put(key); _leave(" = 0 [bad]"); @@ -224,7 +224,7 @@ static unsigned long aio_nr; /* current system wide number of aio requests */ static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ #ifdef CONFIG_SYSCTL -static struct ctl_table aio_sysctls[] = { +static const struct ctl_table aio_sysctls[] = { { .procname = "aio-nr", .data = &aio_nr, diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 42bd1cb7c9cd..583ac81669c2 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -60,14 +60,14 @@ static struct inode *anon_inode_make_secure_inode( const struct inode *context_inode) { struct inode *inode; - const struct qstr qname = QSTR_INIT(name, strlen(name)); int error; inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); if (IS_ERR(inode)) return inode; inode->i_flags &= ~S_PRIVATE; - error = security_inode_init_security_anon(inode, &qname, context_inode); + error = security_inode_init_security_anon(inode, &QSTR(name), + context_inode); if (error) { iput(inode); return ERR_PTR(error); diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 85eea7a4dea3..fc7efd0a7525 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -61,6 +61,13 @@ config BCACHEFS_DEBUG The resulting code will be significantly slower than normal; you probably shouldn't select this option unless you're a developer. +config BCACHEFS_INJECT_TRANSACTION_RESTARTS + bool "Randomly inject transaction restarts" + depends on BCACHEFS_DEBUG + help + Randomly inject transaction restarts in a few core paths - may have a + significant performance penalty + config BCACHEFS_TESTS bool "bcachefs unit and performance tests" depends on BCACHEFS_FS diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fc2ef33b67b3..3ea809990ef1 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1803,7 +1803,6 @@ struct discard_buckets_state { u64 open; u64 need_journal_commit; u64 discarded; - u64 need_journal_commit_this_dev; }; static int bch2_discard_one_bucket(struct btree_trans *trans, @@ -1827,11 +1826,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - pos.inode, pos.offset)) { - s->need_journal_commit++; - s->need_journal_commit_this_dev++; + u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, + pos.inode, pos.offset); + if (seq_ready > c->journal.flushed_seq_ondisk) { + if (seq_ready > c->journal.flushing_seq) + s->need_journal_commit++; goto out; } @@ -1865,23 +1864,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, discard_locked = true; } - if (!bkey_eq(*discard_pos_done, iter.pos) && - ca->mi.discard && !c->opts.nochanges) { - /* - * This works without any other locks because this is the only - * thread that removes items from the need_discard tree - */ - bch2_trans_unlock_long(trans); - blkdev_issue_discard(ca->disk_sb.bdev, - k.k->p.offset * ca->mi.bucket_size, - ca->mi.bucket_size, - GFP_KERNEL); - *discard_pos_done = iter.pos; + if (!bkey_eq(*discard_pos_done, iter.pos)) { s->discarded++; + *discard_pos_done = iter.pos; - ret = bch2_trans_relock_notrace(trans); - if (ret) - goto out; + if (ca->mi.discard && !c->opts.nochanges) { + /* + * This works without any other locks because this is the only + * thread that removes items from the need_discard tree + */ + bch2_trans_unlock_long(trans); + blkdev_issue_discard(ca->disk_sb.bdev, + k.k->p.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL); + ret = bch2_trans_relock_notrace(trans); + if (ret) + goto out; + } } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); @@ -1929,6 +1929,9 @@ static void bch2_do_discards_work(struct work_struct *work) POS(ca->dev_idx, U64_MAX), 0, k, bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false))); + if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal)) + bch2_journal_flush_async(&c->journal, NULL); + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); @@ -2024,7 +2027,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work) break; } - trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_trans_put(trans); percpu_ref_put(&ca->io_ref); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 6df41c331a52..5a781fb4c794 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -205,8 +205,12 @@ static inline bool may_alloc_bucket(struct bch_fs *c, return false; } - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) { + u64 journal_seq_ready = + bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, + bucket.inode, bucket.offset); + if (journal_seq_ready > c->journal.flushed_seq_ondisk) { + if (journal_seq_ready > c->journal.flushing_seq) + s->need_journal_commit++; s->skipped_need_journal_commit++; return false; } @@ -570,7 +574,7 @@ alloc: ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); - if (s.skipped_need_journal_commit * 2 > avail) + if (s.need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 9bbb28e90b93..4aa8ee026cb8 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -18,6 +18,7 @@ struct bucket_alloc_state { u64 buckets_seen; u64 skipped_open; u64 skipped_need_journal_commit; + u64 need_journal_commit; u64 skipped_nocow; u64 skipped_nouse; u64 skipped_mi_btree_bitmap; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 672ca2c1d37d..ca755e8d1a37 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -24,7 +24,10 @@ do { \ } while (0) const char * const bch2_btree_node_flags[] = { -#define x(f) #f, + "typebit", + "typebit", + "typebit", +#define x(f) [BTREE_NODE_##f] = #f, BTREE_FLAGS() #undef x NULL diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 367231ab1980..e32fce4fd258 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2239,8 +2239,6 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos if (unlikely(ret)) return bkey_s_c_err(ret); - btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); - k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); if (!k.k) return k; @@ -2251,6 +2249,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos iter->k = u; k.k = &iter->k; + btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); return k; } @@ -2358,6 +2357,12 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_intent); @@ -2623,6 +2628,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); + int ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + while (1) { k = __bch2_btree_iter_peek_prev(iter, search_key); if (unlikely(!k.k)) @@ -2750,6 +2761,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify_entry_exit(iter); EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_is_extents) && unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { @@ -3107,6 +3124,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (ret) + return ERR_PTR(ret); + struct btree_transaction_stats *s = btree_trans_stats(trans); s->max_mem = max(s->max_mem, new_bytes); @@ -3164,7 +3185,8 @@ out_new_mem: if (old_bytes) { trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); + return ERR_PTR(btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); } out_change_top: p = trans->mem + trans->mem_top; @@ -3272,6 +3294,14 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->last_begin_ip = _RET_IP_; +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + if (trans->restarted) { + trans->restart_count_this_trans++; + } else { + trans->restart_count_this_trans = 0; + } +#endif + trans_set_locked(trans, false); if (trans->restarted) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b9538e6e6d65..b96157f3dc9c 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -355,6 +355,18 @@ static int btree_trans_restart(struct btree_trans *trans, int err) return btree_trans_restart_ip(trans, err, _THIS_IP_); } +static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip) +{ +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) { + trace_and_count(trans->c, trans_restart_injected, trans, ip); + return btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_fault_inject, ip); + } +#endif + return 0; +} + bool bch2_btree_node_upgrade(struct btree_trans *, struct btree_path *, unsigned); @@ -739,7 +751,7 @@ transaction_restart: \ if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - _ret2 ?: trans_was_restarted(_trans, _restart_count); \ + _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \ }) #define for_each_btree_key_max_continue(_trans, _iter, \ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 3b62296c3100..1821f40c161a 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -291,8 +291,10 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, unsigned flags) { - if (flags & BTREE_ITER_cached_nofill) + if (flags & BTREE_ITER_cached_nofill) { + ck_path->l[0].b = NULL; return 0; + } struct bch_fs *c = trans->c; struct btree_iter iter; @@ -746,7 +748,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) rcu_read_unlock(); mutex_lock(&bc->table.mutex); mutex_unlock(&bc->table.mutex); - rcu_read_lock(); continue; } for (i = 0; i < tbl->size; i++) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 6b79b672e0b1..c4f524b2ca9a 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -348,7 +348,7 @@ static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, - trans->journal_u64s, flags); + trans->journal_u64s, flags, trans); } #define JSET_ENTRY_LOG_U64s 4 @@ -999,6 +999,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) bch2_trans_verify_not_unlocked_or_in_restart(trans); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) + goto out_reset; + if (!trans->nr_updates && !trans->journal_entries_u64s) goto out_reset; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a6f251eb4164..a09cbe9cd94f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -509,6 +509,9 @@ struct btree_trans { bool notrace_relock_fail:1; enum bch_errcode restarted:16; u32 restart_count; +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + u32 restart_count_this_trans; +#endif u64 last_begin_time; unsigned long last_begin_ip; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index f4aeadbe53c1..e4e7c804625e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -681,9 +681,11 @@ static void btree_update_nodes_written(struct btree_update *as) b = as->old_nodes[i]; + bch2_trans_begin(trans); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); seq = b->data ? b->data->keys.seq : 0; six_unlock_read(&b->c.lock); + bch2_trans_unlock_long(trans); if (seq == as->old_nodes_seq[i]) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7930ffea3075..26d646e1275c 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -278,12 +278,12 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct bt { struct bset_tree *t = bset_tree_last(b); struct btree_node_entry *bne = max(write_block(b), - (void *) btree_bkey_last(b, bset_tree_last(b))); + (void *) btree_bkey_last(b, t)); ssize_t remaining_space = __bch2_btree_u64s_remaining(b, bne->keys.start); if (unlikely(bset_written(b, bset(b, t)))) { - if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) + if (b->written + block_sectors(c) <= btree_sectors(c)) return bne; } else { if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index f9fb150eda70..c8a488e6b7b8 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -22,23 +22,21 @@ static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_ memset(t->d, 0, sizeof(t->d[0]) << t->bits); } -bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, - u64 flushed_seq, - unsigned dev, u64 bucket) +u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b, + unsigned dev, u64 bucket) { struct buckets_waiting_for_journal_table *t; u64 dev_bucket = (u64) dev << 56 | bucket; - bool ret = false; - unsigned i; + u64 ret = 0; mutex_lock(&b->lock); t = b->t; - for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); if (h->dev_bucket == dev_bucket) { - ret = h->journal_seq > flushed_seq; + ret = h->journal_seq; break; } } diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h index d2ae19cbe18c..365619ca44c8 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.h +++ b/fs/bcachefs/buckets_waiting_for_journal.h @@ -4,8 +4,8 @@ #include "buckets_waiting_for_journal_types.h" -bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, - u64, unsigned, u64); +u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *, + unsigned, u64); int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, u64, unsigned, u64, u64); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index f99ff1819597..114bf2f3879f 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -4,6 +4,7 @@ #include "compress.h" #include "error.h" #include "extents.h" +#include "io_write.h" #include "opts.h" #include "super-io.h" @@ -254,11 +255,14 @@ err: goto out; } -int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, - struct bch_extent_crc_unpacked *crc) +int bch2_bio_uncompress_inplace(struct bch_write_op *op, + struct bio *bio) { + struct bch_fs *c = op->c; + struct bch_extent_crc_unpacked *crc = &op->crc; struct bbuf data = { NULL }; size_t dst_len = crc->uncompressed_size << 9; + int ret = 0; /* bio must own its pages: */ BUG_ON(!bio->bi_vcnt); @@ -266,17 +270,26 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || crc->compressed_size << 9 > c->opts.encoded_extent_max) { - bch_err(c, "error rewriting existing data: extent too big"); + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "error rewriting existing data: extent too big"); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); return -EIO; } data = __bounce_alloc(c, dst_len, WRITE); if (__bio_uncompress(c, bio, data.b, *crc)) { - if (!c->opts.no_data_io) - bch_err(c, "error rewriting existing data: decompression error"); - bio_unmap_or_unbounce(c, data); - return -EIO; + if (!c->opts.no_data_io) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "error rewriting existing data: decompression error"); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } + ret = -EIO; + goto err; } /* @@ -293,9 +306,9 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, crc->uncompressed_size = crc->live_size; crc->offset = 0; crc->csum = (struct bch_csum) { 0, 0 }; - +err: bio_unmap_or_unbounce(c, data); - return 0; + return ret; } int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 607fd5e232c9..bec2f05bfd52 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -47,8 +47,8 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; } -int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, - struct bch_extent_crc_unpacked *); +struct bch_write_op; +int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, struct bvec_iter, struct bch_extent_crc_unpacked); unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 585214931e05..337494facac6 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -91,15 +91,28 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc return true; } -static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) +static noinline void trace_move_extent_finish2(struct data_update *u, + struct bkey_i *new, + struct bkey_i *insert) { - if (trace_move_extent_finish_enabled()) { - struct printbuf buf = PRINTBUF; + struct bch_fs *c = u->op.c; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - trace_move_extent_finish(c, buf.buf); - printbuf_exit(&buf); - } + prt_newline(&buf); + + bch2_data_update_to_text(&buf, u); + prt_newline(&buf); + + prt_str_indented(&buf, "new replicas:\t"); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); + prt_newline(&buf); + + prt_str_indented(&buf, "insert:\t"); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + prt_newline(&buf); + + trace_move_extent_finish(c, buf.buf); + printbuf_exit(&buf); } static void trace_move_extent_fail2(struct data_update *m, @@ -372,7 +385,8 @@ restart_drop_extra_replicas: bch2_btree_iter_set_pos(&iter, next_pos); this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); - trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); + if (trace_move_extent_finish_enabled()) + trace_move_extent_finish2(m, &new->k_i, insert); } err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -525,34 +539,38 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, struct data_update_opts *data_opts) { printbuf_tabstop_push(out, 20); - prt_str(out, "rewrite ptrs:\t"); + + prt_str_indented(out, "rewrite ptrs:\t"); bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); prt_newline(out); - prt_str(out, "kill ptrs:\t"); + prt_str_indented(out, "kill ptrs:\t"); bch2_prt_u64_base2(out, data_opts->kill_ptrs); prt_newline(out); - prt_str(out, "target:\t"); + prt_str_indented(out, "target:\t"); bch2_target_to_text(out, c, data_opts->target); prt_newline(out); - prt_str(out, "compression:\t"); + prt_str_indented(out, "compression:\t"); bch2_compression_opt_to_text(out, io_opts->background_compression); prt_newline(out); - prt_str(out, "opts.replicas:\t"); + prt_str_indented(out, "opts.replicas:\t"); prt_u64(out, io_opts->data_replicas); + prt_newline(out); - prt_str(out, "extra replicas:\t"); + prt_str_indented(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); } void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) { - bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); - prt_newline(out); bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); + prt_newline(out); + + prt_str_indented(out, "old key:\t"); + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); } int bch2_extent_drop_ptrs(struct btree_trans *trans, diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index b5de52a50d10..55333e82d1fe 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -20,6 +20,7 @@ #include "extents.h" #include "fsck.h" #include "inode.h" +#include "journal_reclaim.h" #include "super.h" #include <linux/console.h> diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 5360cbb3ec29..f4372cafea2e 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -210,11 +210,13 @@ static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem * static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, u64 *v, unsigned nr) { + percpu_down_read(&c->mark_lock); struct bch_accounting_mem *acc = &c->accounting; unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &p); bch2_accounting_mem_read_counters(acc, idx, v, nr, false); + percpu_up_read(&c->mark_lock); } static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 8fcf7c8e5ede..9bf316e7b845 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -450,7 +450,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * return ret; struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); - struct qstr name = (struct qstr) QSTR(name_buf); + struct qstr name = QSTR(name_buf); inode->bi_dir = lostfound.bi_inum; @@ -823,6 +823,7 @@ struct inode_walker_entry { struct bch_inode_unpacked inode; u32 snapshot; u64 count; + u64 i_size; }; struct inode_walker { @@ -910,8 +911,9 @@ found: if (k.k->p.snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; - new.snapshot = k.k->p.snapshot; - new.count = 0; + new.snapshot = k.k->p.snapshot; + new.count = 0; + new.i_size = 0; struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); @@ -1116,37 +1118,6 @@ err: return ret; } -static int check_directory_size(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u, - struct bkey_s_c inode_k, bool *write_inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 new_size = 0; - int ret; - - for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, - SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot), - POS(inode_k.k->p.offset, U64_MAX), - 0, k, ret) { - if (k.k->type != KEY_TYPE_dirent) - continue; - - struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k); - struct qstr name = bch2_dirent_get_name(dirent); - - new_size += dirent_occupied_size(&name); - } - bch2_trans_iter_exit(trans, &iter); - - if (!ret && inode_u->bi_size != new_size) { - inode_u->bi_size = new_size; - *write_inode = true; - } - - return ret; -} - static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1335,16 +1306,6 @@ static int check_inode(struct btree_trans *trans, u.bi_journal_seq = journal_cur_seq(&c->journal); do_update = true; } - - if (S_ISDIR(u.bi_mode)) { - ret = check_directory_size(trans, &u, k, &do_update); - - fsck_err_on(ret, - trans, directory_size_mismatch, - "directory inode %llu:%u with the mismatch directory size", - u.bi_inum, k.k->p.snapshot); - ret = 0; - } do_update: if (do_update) { ret = __bch2_fsck_write_inode(trans, &u); @@ -2017,10 +1978,31 @@ fsck_err: return ret; } -static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) +static int check_dir_i_size_notnested(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + int ret = 0; + + darray_for_each(w->inodes, i) + if (fsck_err_on(i->inode.bi_size != i->i_size, + trans, inode_dir_wrong_nlink, + "directory %llu:%u with wrong i_size: got %llu, should be %llu", + w->last_pos.inode, i->snapshot, i->inode.bi_size, i->i_size)) { + i->inode.bi_size = i->i_size; + ret = bch2_fsck_write_inode(trans, &i->inode); + if (ret) + break; + } +fsck_err: + bch_err_fn(c, ret); + return ret; +} + +static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w) { u32 restart_count = trans->restart_count; return check_subdir_count_notnested(trans, w) ?: + check_dir_i_size_notnested(trans, w) ?: trans_was_restarted(trans, restart_count); } @@ -2367,7 +2349,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto out; if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { - ret = check_subdir_count(trans, dir); + ret = check_subdir_dirents_count(trans, dir); if (ret) goto err; } @@ -2457,9 +2439,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) { + if (d.v->d_type == DT_DIR) i->count++; + i->i_size += bkey_bytes(d.k); + } out: err: fsck_err: diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index d2e134528f0e..428b9be6af34 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -285,12 +285,14 @@ void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); +#include "rebalance.h" + static inline struct bch_extent_rebalance bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) { struct bch_io_opts io_opts; bch2_inode_opts_get(&io_opts, c, inode); - return io_opts_to_rebalance_opts(&io_opts); + return io_opts_to_rebalance_opts(c, &io_opts); } int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 3e71860f66b9..03892388832b 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -406,11 +406,21 @@ static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); } -static void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) { __bch2_write_op_error(out, op, op->pos.offset); } +static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_write_op *op, u64 offset) +{ + bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + prt_printf(out, "write error%s: ", + op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); +} + void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k, @@ -873,7 +883,7 @@ static enum prep_encoded_ret { if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) return PREP_ENCODED_CHECKSUM_ERR; - if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) + if (bch2_bio_uncompress_inplace(op, bio)) return PREP_ENCODED_ERR; } @@ -1193,7 +1203,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); struct printbuf buf = PRINTBUF; - __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); + bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index 5400ce94ee57..b4626013abc8 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -20,6 +20,8 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); + #define BCH_WRITE_FLAGS() \ x(ALLOC_NOWAIT) \ x(CACHED) \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 2cd20114b74b..24c294d4634e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -113,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq) static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) { - unsigned i; - - for (i = 0; i < ARRAY_SIZE(p->list); i++) - INIT_LIST_HEAD(&p->list[i]); - INIT_LIST_HEAD(&p->flushed); + for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++) + INIT_LIST_HEAD(&p->unflushed[i]); + for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++) + INIT_LIST_HEAD(&p->flushed[i]); atomic_set(&p->count, count); p->devs.nr = 0; } @@ -320,6 +319,16 @@ void bch2_journal_halt(struct journal *j) spin_unlock(&j->lock); } +void bch2_journal_halt_locked(struct journal *j) +{ + lockdep_assert_held(&j->lock); + + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); + if (!j->err_seq) + j->err_seq = journal_cur_seq(j); + journal_wake(j); +} + static bool journal_entry_want_write(struct journal *j) { bool ret = !journal_entry_is_open(j) || @@ -382,9 +391,12 @@ static int journal_entry_open(struct journal *j) if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) return JOURNAL_ERR_max_in_flight; - if (bch2_fs_fatal_err_on(journal_cur_seq(j) >= JOURNAL_SEQ_MAX, - c, "cannot start: journal seq overflow")) + if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { + bch_err(c, "cannot start: journal seq overflow"); + if (bch2_fs_emergency_read_only_locked(c)) + bch_err(c, "fatal error - emergency read only"); return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + } BUG_ON(!j->cur_entry_sectors); @@ -601,6 +613,16 @@ out: : -BCH_ERR_journal_res_get_blocked; } +static unsigned max_dev_latency(struct bch_fs *c) +{ + u64 nsecs = 0; + + for_each_rw_member(c, ca) + nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); + + return nsecs_to_jiffies(nsecs); +} + /* * Essentially the entry function to the journaling code. When bcachefs is doing * a btree insert, it calls this function to get the current journal write. @@ -612,17 +634,31 @@ out: * btree node write locks. */ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - unsigned flags) + unsigned flags, + struct btree_trans *trans) { int ret; if (closure_wait_event_timeout(&j->async_wait, (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || (flags & JOURNAL_RES_GET_NONBLOCK), - HZ * 10)) + HZ)) return ret; + if (trans) + bch2_trans_unlock_long(trans); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10); + + remaining_wait = max(0, remaining_wait - HZ); + + if (closure_wait_event_timeout(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK), + remaining_wait)) + return ret; + struct printbuf buf = PRINTBUF; bch2_journal_debug_to_text(&buf, j); bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s", @@ -727,7 +763,7 @@ recheck_need_open: * livelock: */ sched_annotate_sleep(); - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); if (ret) return ret; @@ -760,6 +796,7 @@ recheck_need_open: } buf->must_flush = true; + j->flushing_seq = max(j->flushing_seq, seq); if (parent && !closure_wait(&buf->wait, parent)) BUG(); @@ -848,7 +885,7 @@ out: static int __bch2_journal_meta(struct journal *j) { struct journal_res res = {}; - int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); if (ret) return ret; @@ -1602,54 +1639,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) __bch2_journal_debug_to_text(out, j); spin_unlock(&j->lock); } - -bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *pin; - - spin_lock(&j->lock); - if (!test_bit(JOURNAL_running, &j->flags)) { - spin_unlock(&j->lock); - return true; - } - - *seq = max(*seq, j->pin.front); - - if (*seq >= j->pin.back) { - spin_unlock(&j->lock); - return true; - } - - out->atomic++; - - pin_list = journal_seq_pin(j, *seq); - - prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); - printbuf_indent_add(out, 2); - - for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) - list_for_each_entry(pin, &pin_list->list[i], list) - prt_printf(out, "\t%px %ps\n", pin, pin->flush); - - if (!list_empty(&pin_list->flushed)) - prt_printf(out, "flushed:\n"); - - list_for_each_entry(pin, &pin_list->flushed, list) - prt_printf(out, "\t%px %ps\n", pin, pin->flush); - - printbuf_indent_sub(out, 2); - - --out->atomic; - spin_unlock(&j->lock); - - return false; -} - -void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) -{ - u64 seq = 0; - - while (!bch2_journal_seq_pins_to_text(out, j, &seq)) - seq++; -} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index cb0df0663946..107f7f901cd9 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -312,7 +312,7 @@ static inline void bch2_journal_res_put(struct journal *j, } int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, - unsigned); + unsigned, struct btree_trans *); /* First bits for BCH_WATERMARK: */ enum journal_res_flags { @@ -368,7 +368,8 @@ static inline int journal_res_get_fast(struct journal *j, } static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s, unsigned flags) + unsigned u64s, unsigned flags, + struct btree_trans *trans) { int ret; @@ -380,7 +381,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re if (journal_res_get_fast(j, res, flags)) goto out; - ret = bch2_journal_res_get_slowpath(j, res, flags); + ret = bch2_journal_res_get_slowpath(j, res, flags, trans); if (ret) return ret; out: @@ -408,6 +409,7 @@ bool bch2_journal_noflush_seq(struct journal *, u64, u64); int bch2_journal_meta(struct journal *); void bch2_journal_halt(struct journal *); +void bch2_journal_halt_locked(struct journal *); static inline int bch2_journal_error(struct journal *j) { @@ -429,8 +431,6 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -void bch2_journal_pins_to_text(struct printbuf *, struct journal *); -bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7f2efe85a805..11c39e0c34f4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,6 +17,7 @@ #include "sb-clean.h" #include "trace.h" +#include <linux/ioprio.h> #include <linux/string_choices.h> void bch2_journal_pos_from_member_info_set(struct bch_fs *c) @@ -1763,6 +1764,7 @@ static CLOSURE_CALLBACK(journal_write_submit) bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); ca->prev_journal_sector = bio->bi_iter.bi_sector; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3c8242606da7..d373cd181a7f 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -327,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j) popped = true; } - if (popped) + if (popped) { bch2_journal_space_available(j); + __closure_wake_up(&j->reclaim_flush_wait); + } } bool __bch2_journal_pin_put(struct journal *j, u64 seq) @@ -362,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j, pin->seq = 0; list_del_init(&pin->list); + if (j->reclaim_flush_wait.list.first) + __closure_wake_up(&j->reclaim_flush_wait); + /* * Unpinning a journal entry may make journal_next_bucket() succeed, if * writing a new last_seq will now make another bucket available: @@ -379,15 +384,19 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } -static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) +static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, + journal_pin_flush_fn fn) { if (fn == bch2_btree_node_flush0 || - fn == bch2_btree_node_flush1) - return JOURNAL_PIN_btree; - else if (fn == bch2_btree_key_cache_journal_flush) - return JOURNAL_PIN_key_cache; + fn == bch2_btree_node_flush1) { + unsigned idx = fn == bch2_btree_node_flush1; + struct btree *b = container_of(pin, struct btree, writes[idx].journal); + + return JOURNAL_PIN_TYPE_btree0 - b->c.level; + } else if (fn == bch2_btree_key_cache_journal_flush) + return JOURNAL_PIN_TYPE_key_cache; else - return JOURNAL_PIN_other; + return JOURNAL_PIN_TYPE_other; } static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, @@ -406,7 +415,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, atomic_inc(&pin_list->count); pin->seq = seq; pin->flush = flush_fn; - list_add(&pin->list, &pin_list->list[type]); + + if (list_empty(&pin_list->unflushed[type]) && + j->reclaim_flush_wait.list.first) + __closure_wake_up(&j->reclaim_flush_wait); + + list_add(&pin->list, &pin_list->unflushed[type]); } void bch2_journal_pin_copy(struct journal *j, @@ -431,7 +445,7 @@ void bch2_journal_pin_copy(struct journal *j, bool reclaim = __journal_pin_drop(j, dst); - bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); + bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -455,7 +469,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, bool reclaim = __journal_pin_drop(j, pin); - bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); + bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -499,16 +513,15 @@ journal_get_next_pin(struct journal *j, { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; - unsigned i; fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { if (*seq > seq_to_flush && !allowed_above_seq) break; - for (i = 0; i < JOURNAL_PIN_NR; i++) - if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || - ((1U << i) & allowed_above_seq)) { - ret = list_first_entry_or_null(&pin_list->list[i], + for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) + if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) || + (BIT(i) & allowed_above_seq)) { + ret = list_first_entry_or_null(&pin_list->unflushed[i], struct journal_entry_pin, list); if (ret) return ret; @@ -544,8 +557,8 @@ static size_t journal_flush_pins(struct journal *j, } if (min_key_cache) { - allowed_above |= 1U << JOURNAL_PIN_key_cache; - allowed_below |= 1U << JOURNAL_PIN_key_cache; + allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache); + allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache); } cond_resched(); @@ -553,7 +566,9 @@ static size_t journal_flush_pins(struct journal *j, j->last_flushed = jiffies; spin_lock(&j->lock); - pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); + pin = journal_get_next_pin(j, seq_to_flush, + allowed_below, + allowed_above, &seq); if (pin) { BUG_ON(j->flush_in_progress); j->flush_in_progress = pin; @@ -576,7 +591,7 @@ static size_t journal_flush_pins(struct journal *j, spin_lock(&j->lock); /* Pin might have been dropped or rearmed: */ if (likely(!err && !j->flush_in_progress_dropped)) - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); j->flush_in_progress = NULL; j->flush_in_progress_dropped = false; spin_unlock(&j->lock); @@ -816,10 +831,41 @@ int bch2_journal_reclaim_start(struct journal *j) return 0; } +static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush, + unsigned types) +{ + struct journal_entry_pin_list *pin_list; + u64 seq; + + spin_lock(&j->lock); + fifo_for_each_entry_ptr(pin_list, &j->pin, seq) { + if (seq > seq_to_flush) + break; + + for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) + if ((BIT(i) & types) && + (!list_empty(&pin_list->unflushed[i]) || + !list_empty(&pin_list->flushed[i]))) { + spin_unlock(&j->lock); + return true; + } + } + spin_unlock(&j->lock); + + return false; +} + +static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush, + unsigned types) +{ + return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) || + journal_pins_still_flushing(j, seq_to_flush, types); +} + static int journal_flush_done(struct journal *j, u64 seq_to_flush, bool *did_work) { - int ret; + int ret = 0; ret = bch2_journal_error(j); if (ret) @@ -827,12 +873,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - if (journal_flush_pins(j, seq_to_flush, - (1U << JOURNAL_PIN_key_cache)| - (1U << JOURNAL_PIN_other), 0, 0, 0) || - journal_flush_pins(j, seq_to_flush, - (1U << JOURNAL_PIN_btree), 0, 0, 0)) - *did_work = true; + for (int type = JOURNAL_PIN_TYPE_NR - 1; + type >= 0; + --type) + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) { + *did_work = true; + goto unlock; + } if (seq_to_flush > journal_cur_seq(j)) bch2_journal_entry_close(j); @@ -847,6 +894,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, !fifo_used(&j->pin); spin_unlock(&j->lock); +unlock: mutex_unlock(&j->reclaim_lock); return ret; @@ -860,7 +908,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) if (!test_bit(JOURNAL_running, &j->flags)) return false; - closure_wait_event(&j->async_wait, + closure_wait_event(&j->reclaim_flush_wait, journal_flush_done(j, seq_to_flush, &did_work)); return did_work; @@ -926,3 +974,54 @@ err: return ret; } + +bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *pin; + + spin_lock(&j->lock); + if (!test_bit(JOURNAL_running, &j->flags)) { + spin_unlock(&j->lock); + return true; + } + + *seq = max(*seq, j->pin.front); + + if (*seq >= j->pin.back) { + spin_unlock(&j->lock); + return true; + } + + out->atomic++; + + pin_list = journal_seq_pin(j, *seq); + + prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); + printbuf_indent_add(out, 2); + + prt_printf(out, "unflushed:\n"); + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++) + list_for_each_entry(pin, &pin_list->unflushed[i], list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); + + prt_printf(out, "flushed:\n"); + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++) + list_for_each_entry(pin, &pin_list->flushed[i], list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); + + printbuf_indent_sub(out, 2); + + --out->atomic; + spin_unlock(&j->lock); + + return false; +} + +void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +{ + u64 seq = 0; + + while (!bch2_journal_seq_pins_to_text(out, j, &seq)) + seq++; +} diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index ec84c3345281..0a73d7134e1c 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j) int bch2_journal_flush_device_pins(struct journal *, int); +void bch2_journal_pins_to_text(struct printbuf *, struct journal *); +bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); + #endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index e9bd716fbb71..1ef3a28ed6ab 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -53,15 +53,18 @@ struct journal_buf { */ enum journal_pin_type { - JOURNAL_PIN_btree, - JOURNAL_PIN_key_cache, - JOURNAL_PIN_other, - JOURNAL_PIN_NR, + JOURNAL_PIN_TYPE_btree3, + JOURNAL_PIN_TYPE_btree2, + JOURNAL_PIN_TYPE_btree1, + JOURNAL_PIN_TYPE_btree0, + JOURNAL_PIN_TYPE_key_cache, + JOURNAL_PIN_TYPE_other, + JOURNAL_PIN_TYPE_NR, }; struct journal_entry_pin_list { - struct list_head list[JOURNAL_PIN_NR]; - struct list_head flushed; + struct list_head unflushed[JOURNAL_PIN_TYPE_NR]; + struct list_head flushed[JOURNAL_PIN_TYPE_NR]; atomic_t count; struct bch_devs_list devs; }; @@ -226,6 +229,7 @@ struct journal { /* Used when waiting because the journal was full */ wait_queue_head_t wait; struct closure_waitlist async_wait; + struct closure_waitlist reclaim_flush_wait; struct delayed_work write_work; struct workqueue_struct *wq; @@ -236,6 +240,7 @@ struct journal { /* seq, last_seq from the most recent journal entry successfully written */ u64 seq_ondisk; u64 flushed_seq_ondisk; + u64 flushing_seq; u64 last_seq_ondisk; u64 err_seq; u64 last_empty_seq; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 85c361e78ba5..21805509ab9e 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -215,7 +215,8 @@ static int bch2_copygc(struct moving_context *ctxt, }; move_buckets buckets = { 0 }; struct move_bucket_in_flight *f; - u64 moved = atomic64_read(&ctxt->stats->sectors_moved); + u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); + u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); @@ -245,7 +246,6 @@ static int bch2_copygc(struct moving_context *ctxt, *did_work = true; } err: - darray_exit(&buckets); /* no entries in LRU btree found, or got to end: */ if (bch2_err_matches(ret, ENOENT)) @@ -254,8 +254,11 @@ err: if (ret < 0 && !bch2_err_matches(ret, EROFS)) bch_err_msg(c, ret, "from bch2_move_data()"); - moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; - trace_and_count(c, copygc, c, moved, 0, 0, 0); + sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; + sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; + trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); + + darray_exit(&buckets); return ret; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index e763d52e0f38..9d397fc2a1f0 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -476,13 +476,13 @@ enum fsck_err_opts { NULL, "Enable nocow mode: enables runtime locking in\n"\ "data move path needed if nocow will ever be in use\n")\ x(copygc_enabled, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable copygc: disable for debugging, or to\n"\ "quiet the system when doing performance testing\n")\ x(rebalance_enabled, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable rebalance: disable for debugging, or to\n"\ @@ -659,18 +659,4 @@ static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); -/* rebalance opts: */ - -static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_io_opts *opts) -{ - return (struct bch_extent_rebalance) { - .type = BIT(BCH_EXTENT_ENTRY_rebalance), -#define x(_name) \ - ._name = opts->_name, \ - ._name##_from_inode = opts->_name##_from_inode, - BCH_REBALANCE_OPTS() -#undef x - }; -}; - #endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 4adc74cd3f70..d0a1f5cd5c2b 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -121,12 +121,10 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) } } incompressible: - if (opts->background_target && - bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { + if (opts->background_target) bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) sectors += p.crc.compressed_size; - } return sectors; } @@ -140,7 +138,7 @@ static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opt const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { - struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts); + struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts); return old == NULL || memcmp(old, &new, sizeof(new)); } else { return old != NULL; @@ -163,7 +161,7 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, k.k->u64s += sizeof(*old) / sizeof(u64); } - *old = io_opts_to_rebalance_opts(opts); + *old = io_opts_to_rebalance_opts(c, opts); } else { if (old) extent_entry_drop(k, (union bch_extent_entry *) old); diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 0a0821ab895d..62a3859d3823 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -4,8 +4,28 @@ #include "compress.h" #include "disk_groups.h" +#include "opts.h" #include "rebalance_types.h" +static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, + struct bch_io_opts *opts) +{ + struct bch_extent_rebalance r = { + .type = BIT(BCH_EXTENT_ENTRY_rebalance), +#define x(_name) \ + ._name = opts->_name, \ + ._name##_from_inode = opts->_name##_from_inode, + BCH_REBALANCE_OPTS() +#undef x + }; + + if (r.background_target && + !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target)) + r.background_target = 0; + + return r; +}; + u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); int bch2_get_update_rebalance_opts(struct btree_trans *, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 98825437381c..71c786cdb192 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -32,7 +32,6 @@ #include <linux/sort.h> #include <linux/stat.h> -#define QSTR(n) { { { .len = strlen(n) } }, .name = n } int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) { diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 93ba4f4e47ca..441e648f28b5 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -172,7 +172,7 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, bool should_commit) { if (REFLINK_P_ERROR(p.v)) - return -BCH_ERR_missing_indirect_extent; + return 0; struct bch_fs *c = trans->c; u64 live_start = REFLINK_P_IDX(p.v); @@ -259,8 +259,6 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, return k; if (unlikely(!bkey_extent_is_reflink_data(k.k))) { - bch2_trans_iter_exit(trans, iter); - unsigned size = min((u64) k.k->size, REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - reflink_offset); @@ -268,14 +266,16 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, k.k->p.offset, should_commit); - if (ret) + if (ret) { + bch2_trans_iter_exit(trans, iter); return bkey_s_c_err(ret); + } } else if (unlikely(REFLINK_P_ERROR(p.v))) { - bch2_trans_iter_exit(trans, iter); - int ret = bch2_indirect_extent_not_missing(trans, p, should_commit); - if (ret) + if (ret) { + bch2_trans_iter_exit(trans, iter); return bkey_s_c_err(ret); + } } *offset_into_extent = reflink_offset - bkey_start_offset(k.k); @@ -300,7 +300,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, if (ret) return ret; - if (bkey_deleted(k.k)) { + if (!bkey_refcount_c(k)) { if (!(flags & BTREE_TRIGGER_overwrite)) ret = -BCH_ERR_missing_indirect_extent; goto next; diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 14f6b6a5fb38..35e07bc8fbd3 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -92,7 +92,7 @@ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ BCH_FSCK_ERR_accounting_key_junk_at_end) \ x(directory_size, \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ BCH_FSCK_ERR_directory_size_mismatch) \ #define DOWNGRADE_TABLE() \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 0b4fe899209b..b86ec013d7d7 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -57,7 +57,7 @@ enum bch_fsck_flags { x(bset_wrong_sector_offset, 44, 0) \ x(bset_empty, 45, 0) \ x(bset_bad_seq, 46, 0) \ - x(bset_blacklisted_journal_seq, 47, 0) \ + x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \ x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ x(btree_node_bad_btree, 49, 0) \ x(btree_node_bad_level, 50, 0) \ @@ -180,9 +180,9 @@ enum bch_fsck_flags { x(ptr_crc_nonce_mismatch, 162, 0) \ x(ptr_stripe_redundant, 163, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ - x(reflink_v_refcount_wrong, 165, 0) \ + x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ x(reflink_v_pos_bad, 292, 0) \ - x(reflink_p_to_missing_reflink_v, 166, 0) \ + x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \ x(reflink_refcount_underflow, 293, 0) \ x(stripe_pos_bad, 167, 0) \ x(stripe_val_size_bad, 168, 0) \ diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index 8c2c5539de2e..d78451c2a0c6 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -31,11 +31,11 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir } } -static int fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old) +static noinline int fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c_dirent old) { struct qstr old_name = bch2_dirent_get_name(old); struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); @@ -71,11 +71,11 @@ static int fsck_rename_dirent(struct btree_trans *trans, return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); } -static int hash_pick_winner(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c k1, - struct bkey_s_c k2) +static noinline int hash_pick_winner(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c k1, + struct bkey_s_c k2) { if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) @@ -142,8 +142,8 @@ fsck_err: * All versions of the same inode in different snapshots must have the same hash * seed/type: verify that the hash info we're using matches the root */ -static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, - struct bch_hash_info *hash_info) +static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, + struct bch_hash_info *hash_info) { struct bch_fs *c = trans->c; struct btree_iter iter; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index e3d0475232e5..b7b96283c316 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -428,7 +428,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, POS(0, snapid), 0, snapshot); - ret = bkey_err(subvol); + ret = bkey_err(snapshot); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, "missing snapshot %u", snapid); if (ret) @@ -440,6 +440,11 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter, BTREE_ID_snapshot_trees, POS(0, treeid), 0, snapshot_tree); + ret = bkey_err(snapshot_tree); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing snapshot tree %u", treeid); + if (ret) + goto err; if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) { struct bkey_i_snapshot_tree *snapshot_tree_mut = diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d97ea7bd1171..6d97d412fed9 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -411,6 +411,17 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) return ret; } +bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) +{ + bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); + + bch2_journal_halt_locked(&c->journal); + bch2_fs_read_only_async(c); + + wake_up(&bch2_read_only_wait); + return ret; +} + static int bch2_fs_read_write_late(struct bch_fs *c) { int ret; diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index fa6d52216510..04f8287eff5c 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -29,6 +29,7 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); +bool bch2_fs_emergency_read_only_locked(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 9d40b7d4ea29..c1b51009edf6 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -727,7 +727,7 @@ DEFINE_EVENT(fs_str, bucket_alloc_fail, TP_ARGS(c, str) ); -TRACE_EVENT(discard_buckets, +DECLARE_EVENT_CLASS(discard_buckets_class, TP_PROTO(struct bch_fs *c, u64 seen, u64 open, u64 need_journal_commit, u64 discarded, const char *err), TP_ARGS(c, seen, open, need_journal_commit, discarded, err), @@ -759,6 +759,18 @@ TRACE_EVENT(discard_buckets, __entry->err) ); +DEFINE_EVENT(discard_buckets_class, discard_buckets, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, const char *err), + TP_ARGS(c, seen, open, need_journal_commit, discarded, err) +); + +DEFINE_EVENT(discard_buckets_class, discard_buckets_fast, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, const char *err), + TP_ARGS(c, seen, open, need_journal_commit, discarded, err) +); + TRACE_EVENT(bucket_invalidate, TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), TP_ARGS(c, dev, bucket, sectors), @@ -902,32 +914,30 @@ TRACE_EVENT(evacuate_bucket, TRACE_EVENT(copygc, TP_PROTO(struct bch_fs *c, - u64 sectors_moved, u64 sectors_not_moved, - u64 buckets_moved, u64 buckets_not_moved), - TP_ARGS(c, - sectors_moved, sectors_not_moved, - buckets_moved, buckets_not_moved), + u64 buckets, + u64 sectors_seen, + u64 sectors_moved), + TP_ARGS(c, buckets, sectors_seen, sectors_moved), TP_STRUCT__entry( __field(dev_t, dev ) + __field(u64, buckets ) + __field(u64, sectors_seen ) __field(u64, sectors_moved ) - __field(u64, sectors_not_moved ) - __field(u64, buckets_moved ) - __field(u64, buckets_not_moved ) ), TP_fast_assign( __entry->dev = c->dev; + __entry->buckets = buckets; + __entry->sectors_seen = sectors_seen; __entry->sectors_moved = sectors_moved; - __entry->sectors_not_moved = sectors_not_moved; - __entry->buckets_moved = buckets_moved; - __entry->buckets_not_moved = buckets_moved; ), - TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", + TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->sectors_moved, __entry->sectors_not_moved, - __entry->buckets_moved, __entry->buckets_not_moved) + __entry->buckets, + __entry->sectors_seen, + __entry->sectors_moved) ); TRACE_EVENT(copygc_wait, diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 1a1720116071..e7c3541b38f3 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -670,8 +670,6 @@ static inline int cmp_le32(__le32 l, __le32 r) #include <linux/uuid.h> -#define QSTR(n) { { { .len = strlen(n) } }, .name = n } - static inline bool qstr_eq(const struct qstr l, const struct qstr r) { return l.len == r.len && !memcmp(l.name, r.name, l.len); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 92071ca0655f..3dc5a35dd19b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1496,6 +1496,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { btrfs_unlock_up_safe(p, parent_level + 1); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); @@ -1539,6 +1540,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { ASSERT(ret == -EAGAIN); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a2cac9d0a1a9..b2fae67f8fa3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -523,8 +523,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) u64 end; u32 len; - /* For now only order 0 folios are supported for data. */ - ASSERT(folio_order(folio) == 0); btrfs_debug(fs_info, "%s: bi_sector=%llu, err=%d, mirror=%u", __func__, bio->bi_iter.bi_sector, bio->bi_status, @@ -552,7 +550,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) if (likely(uptodate)) { loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> folio_shift(folio); /* * Zero out the remaining part if this range straddles @@ -561,9 +558,11 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * Here we should only zero the range inside the folio, * not touch anything else. * - * NOTE: i_size is exclusive while end is inclusive. + * NOTE: i_size is exclusive while end is inclusive and + * folio_contains() takes PAGE_SIZE units. */ - if (folio_index(folio) == end_index && i_size <= end) { + if (folio_contains(folio, i_size >> PAGE_SHIFT) && + i_size <= end) { u32 zero_start = max(offset_in_folio(folio, i_size), offset_in_folio(folio, start)); u32 zero_len = offset_in_folio(folio, end) + 1 - @@ -899,7 +898,6 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, u64 len, struct extent_map **em_cached) { struct extent_map *em; - struct extent_state *cached_state = NULL; ASSERT(em_cached); @@ -915,14 +913,12 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, *em_cached = NULL; } - btrfs_lock_and_flush_ordered_range(inode, start, start + len - 1, &cached_state); em = btrfs_get_extent(inode, folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } - unlock_extent(&inode->io_tree, start, start + len - 1, &cached_state); return em; } @@ -956,7 +952,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, return ret; } - if (folio->index == last_byte >> folio_shift(folio)) { + if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { size_t zero_offset = offset_in_folio(folio, last_byte); if (zero_offset) { @@ -1079,11 +1075,18 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { + struct btrfs_inode *inode = folio_to_inode(folio); + const u64 start = folio_pos(folio); + const u64 end = start + folio_size(folio) - 1; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; struct extent_map *em_cached = NULL; int ret; + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + unlock_extent(&inode->io_tree, start, end, &cached_state); + free_extent_map(em_cached); /* @@ -2380,12 +2383,20 @@ void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; struct folio *folio; + struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); + const u64 start = readahead_pos(rac); + const u64 end = start + readahead_length(rac) - 1; + struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + unlock_extent(&inode->io_tree, start, end, &cached_state); + if (em_cached) free_extent_map(em_cached); submit_one_bio(&bio_ctrl); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 36f51c311bb1..ed3c0d6546c5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1039,7 +1039,6 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; - loff_t start_pos; /* * Quickly bail out on NOWAIT writes if we don't have the nodatacow or @@ -1066,9 +1065,8 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) inode_inc_iversion(inode); } - start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); - if (start_pos > oldsize) { + if (pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 30eceaf829a7..4aca7475fd82 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1229,6 +1229,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( */ if (WARN_ON_ONCE(len >= ordered->num_bytes)) return ERR_PTR(-EINVAL); + /* + * If our ordered extent had an error there's no point in continuing. + * The error may have come from a transaction abort done either by this + * task or some other concurrent task, and the transaction abort path + * iterates over all existing ordered extents and sets the flag + * BTRFS_ORDERED_IOERR on them. + */ + if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) { + const int fs_error = BTRFS_FS_ERROR(fs_info); + + return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO); + } /* We cannot split partially completed ordered extents. */ if (ordered->bytes_left) { ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b90fabe302e6..f9d3766c809b 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1880,11 +1880,7 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su * Commit current transaction to make sure all the rfer/excl numbers * get updated. */ - trans = btrfs_start_transaction(fs_info->quota_root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_commit_transaction(trans); + ret = btrfs_commit_current_transaction(fs_info->quota_root); if (ret < 0) return ret; @@ -1897,8 +1893,11 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su /* * It's squota and the subvolume still has numbers needed for future * accounting, in this case we can not delete it. Just skip it. + * + * Or the qgroup is already removed by a qgroup rescan. For both cases we're + * safe to ignore them. */ - if (ret == -EBUSY) + if (ret == -EBUSY || ret == -ENOENT) ret = 0; return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 15312013f2a3..aca83a98b75a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -274,8 +274,10 @@ loop: cur_trans = fs_info->running_transaction; if (cur_trans) { if (TRANS_ABORTED(cur_trans)) { + const int abort_error = cur_trans->aborted; + spin_unlock(&fs_info->trans_lock); - return cur_trans->aborted; + return abort_error; } if (btrfs_blocked_trans_types[cur_trans->state] & type) { spin_unlock(&fs_info->trans_lock); diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c index 1715d5ca2b2d..e341ade47dd8 100644 --- a/fs/cachefiles/error_inject.c +++ b/fs/cachefiles/error_inject.c @@ -11,7 +11,7 @@ unsigned int cachefiles_error_injection_state; static struct ctl_table_header *cachefiles_sysctl; -static struct ctl_table cachefiles_sysctls[] = { +static const struct ctl_table cachefiles_sysctls[] = { { .procname = "error_injection", .data = &cachefiles_error_injection_state, diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index fdf9dc15eafa..fdd404fc8112 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -412,7 +412,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { - char name[100]; + char name[NAME_MAX]; doutc(fsc->client, "begin\n"); fsc->debugfs_congestion_kb = diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0bf388e07a02..62e99e65250d 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1940,29 +1940,19 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry, /* * Check if cached dentry can be trusted. */ -static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) +static int ceph_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; int valid = 0; - struct dentry *parent; - struct inode *dir, *inode; + struct inode *inode; - valid = fscrypt_d_revalidate(dentry, flags); + valid = fscrypt_d_revalidate(dir, name, dentry, flags); if (valid <= 0) return valid; - if (flags & LOOKUP_RCU) { - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - inode = d_inode_rcu(dentry); - } else { - parent = dget_parent(dentry); - dir = d_inode(parent); - inode = d_inode(dentry); - } + inode = d_inode_rcu(dentry); doutc(cl, "%p '%pd' inode %p offset 0x%llx nokey %d\n", dentry, dentry, inode, ceph_dentry(dentry)->offset, @@ -2008,6 +1998,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) req->r_parent = dir; ihold(dir); + req->r_dname = name; + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; if (ceph_security_xattr_wanted(dir)) mask |= CEPH_CAP_XATTR_SHARED; @@ -2038,9 +2030,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) doutc(cl, "%p '%pd' %s\n", dentry, dentry, valid ? "valid" : "invalid"); if (!valid) ceph_dir_clear_complete(dir); - - if (!(flags & LOOKUP_RCU)) - dput(parent); return valid; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 785fe489ef4b..54b3421501e9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2621,6 +2621,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) { struct inode *dir = req->r_parent; struct dentry *dentry = req->r_dentry; + const struct qstr *name = req->r_dname; u8 *cryptbuf = NULL; u32 len = 0; int ret = 0; @@ -2641,8 +2642,10 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) if (!fscrypt_has_encryption_key(dir)) goto success; - if (!fscrypt_fname_encrypted_size(dir, dentry->d_name.len, NAME_MAX, - &len)) { + if (!name) + name = &dentry->d_name; + + if (!fscrypt_fname_encrypted_size(dir, name->len, NAME_MAX, &len)) { WARN_ON_ONCE(1); return ERR_PTR(-ENAMETOOLONG); } @@ -2657,7 +2660,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) if (!cryptbuf) return ERR_PTR(-ENOMEM); - ret = fscrypt_fname_encrypt(dir, &dentry->d_name, cryptbuf, len); + ret = fscrypt_fname_encrypt(dir, name, cryptbuf, len); if (ret) { kfree(cryptbuf); return ERR_PTR(ret); @@ -2945,12 +2948,12 @@ static struct ceph_mds_request_head_legacy * find_legacy_request_head(void *p, u64 features) { bool legacy = !(features & CEPH_FEATURE_FS_BTIME); - struct ceph_mds_request_head_old *ohead; + struct ceph_mds_request_head *head; if (legacy) return (struct ceph_mds_request_head_legacy *)p; - ohead = (struct ceph_mds_request_head_old *)p; - return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid; + head = (struct ceph_mds_request_head *)p; + return (struct ceph_mds_request_head_legacy *)&head->oldest_client_tid; } /* @@ -3020,7 +3023,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, if (legacy) len = sizeof(struct ceph_mds_request_head_legacy); else if (request_head_version == 1) - len = sizeof(struct ceph_mds_request_head_old); + len = offsetofend(struct ceph_mds_request_head, args); else if (request_head_version == 2) len = offsetofend(struct ceph_mds_request_head, ext_num_fwd); else @@ -3104,11 +3107,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, msg->hdr.version = cpu_to_le16(3); p = msg->front.iov_base + sizeof(*lhead); } else if (request_head_version == 1) { - struct ceph_mds_request_head_old *ohead = msg->front.iov_base; + struct ceph_mds_request_head *nhead = msg->front.iov_base; msg->hdr.version = cpu_to_le16(4); - ohead->version = cpu_to_le16(1); - p = msg->front.iov_base + sizeof(*ohead); + nhead->version = cpu_to_le16(1); + p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, args); } else if (request_head_version == 2) { struct ceph_mds_request_head *nhead = msg->front.iov_base; @@ -3265,7 +3268,7 @@ static int __prepare_send_request(struct ceph_mds_session *session, * so we limit to retry at most 256 times. */ if (req->r_attempts) { - old_max_retry = sizeof_field(struct ceph_mds_request_head_old, + old_max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE); if ((old_version && req->r_attempts >= old_max_retry) || @@ -5690,18 +5693,18 @@ static int ceph_mds_auth_match(struct ceph_mds_client *mdsc, * * All the other cases --> mismatch */ + bool path_matched = true; char *first = strstr(_tpath, auth->match.path); - if (first != _tpath) { - if (free_tpath) - kfree(_tpath); - return 0; + if (first != _tpath || + (tlen > len && _tpath[len] != '/')) { + path_matched = false; } - if (tlen > len && _tpath[len] != '/') { - if (free_tpath) - kfree(_tpath); + if (free_tpath) + kfree(_tpath); + + if (!path_matched) return 0; - } } } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 38bb7e0d2d79..7c9fee9e80d4 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -299,6 +299,8 @@ struct ceph_mds_request { struct inode *r_target_inode; /* resulting inode */ struct inode *r_new_inode; /* new inode (for creates) */ + const struct qstr *r_dname; /* stable name (for ->d_revalidate) */ + #define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ #define CEPH_MDS_R_ABORTED (2) /* call was aborted */ #define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */ diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 4e552ba7bd43..a3e2dfeedfbf 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -445,7 +445,8 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) } /* called when a cache lookup succeeds */ -static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) +static int coda_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *de, unsigned int flags) { struct inode *inode; struct coda_inode_info *cii; diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index 9f2d5743e2c8..0df46f09b6cc 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -14,7 +14,7 @@ static struct ctl_table_header *fs_table_header; -static struct ctl_table coda_table[] = { +static const struct ctl_table coda_table[] = { { .procname = "timeout", .data = &coda_timeout, diff --git a/fs/coredump.c b/fs/coredump.c index d48edb37bc35..591700e1b2ce 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -995,7 +995,7 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write, static const unsigned int core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT; static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX; -static struct ctl_table coredump_sysctls[] = { +static const struct ctl_table coredump_sysctls[] = { { .procname = "core_uses_pid", .data = &core_uses_pid, diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 0ad52fbe51c9..010f9c0a4c2f 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -574,11 +574,10 @@ EXPORT_SYMBOL_GPL(fscrypt_fname_siphash); * Validate dentries in encrypted directories to make sure we aren't potentially * caching stale dentries after a key has been added. */ -int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - struct dentry *dir; int err; - int valid; /* * Plaintext names are always valid, since fscrypt doesn't support @@ -591,30 +590,21 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) /* * No-key name; valid if the directory's key is still unavailable. * - * Although fscrypt forbids rename() on no-key names, we still must use - * dget_parent() here rather than use ->d_parent directly. That's - * because a corrupted fs image may contain directory hard links, which - * the VFS handles by moving the directory's dentry tree in the dcache - * each time ->lookup() finds the directory and it already has a dentry - * elsewhere. Thus ->d_parent can be changing, and we must safely grab - * a reference to some ->d_parent to prevent it from being freed. + * Note in RCU mode we have to bail if we get here - + * fscrypt_get_encryption_info() may block. */ if (flags & LOOKUP_RCU) return -ECHILD; - dir = dget_parent(dentry); /* * Pass allow_unsupported=true, so that files with an unsupported * encryption policy can be deleted. */ - err = fscrypt_get_encryption_info(d_inode(dir), true); - valid = !fscrypt_has_encryption_key(d_inode(dir)); - dput(dir); - + err = fscrypt_get_encryption_info(dir, true); if (err < 0) return err; - return valid; + return !fscrypt_has_encryption_key(dir); } EXPORT_SYMBOL_GPL(fscrypt_d_revalidate); diff --git a/fs/dcache.c b/fs/dcache.c index 1a01d7a6a7a9..e3634916ffb9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -192,7 +192,7 @@ static int proc_nr_dentry(const struct ctl_table *table, int write, void *buffer return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table fs_dcache_sysctls[] = { +static const struct ctl_table fs_dcache_sysctls[] = { { .procname = "dentry-state", .data = &dentry_stat, @@ -295,12 +295,16 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c return dentry_string_cmp(cs, ct, tcount); } +/* + * long names are allocated separately from dentry and never modified. + * Refcounted, freeing is RCU-delayed. See take_dentry_name_snapshot() + * for the reason why ->count and ->head can't be combined into a union. + * dentry_string_cmp() relies upon ->name[] being word-aligned. + */ struct external_name { - union { - atomic_t count; - struct rcu_head head; - } u; - unsigned char name[]; + atomic_t count; + struct rcu_head head; + unsigned char name[] __aligned(sizeof(unsigned long)); }; static inline struct external_name *external_name(struct dentry *dentry) @@ -324,31 +328,45 @@ static void __d_free_external(struct rcu_head *head) static inline int dname_external(const struct dentry *dentry) { - return dentry->d_name.name != dentry->d_iname; + return dentry->d_name.name != dentry->d_shortname.string; } void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) { - spin_lock(&dentry->d_lock); - name->name = dentry->d_name; - if (unlikely(dname_external(dentry))) { - atomic_inc(&external_name(dentry)->u.count); + unsigned seq; + const unsigned char *s; + + rcu_read_lock(); +retry: + seq = read_seqcount_begin(&dentry->d_seq); + s = READ_ONCE(dentry->d_name.name); + name->name.hash_len = dentry->d_name.hash_len; + name->name.name = name->inline_name.string; + if (likely(s == dentry->d_shortname.string)) { + name->inline_name = dentry->d_shortname; } else { - memcpy(name->inline_name, dentry->d_iname, - dentry->d_name.len + 1); - name->name.name = name->inline_name; + struct external_name *p; + p = container_of(s, struct external_name, name[0]); + // get a valid reference + if (unlikely(!atomic_inc_not_zero(&p->count))) + goto retry; + name->name.name = s; } - spin_unlock(&dentry->d_lock); + if (read_seqcount_retry(&dentry->d_seq, seq)) { + release_dentry_name_snapshot(name); + goto retry; + } + rcu_read_unlock(); } EXPORT_SYMBOL(take_dentry_name_snapshot); void release_dentry_name_snapshot(struct name_snapshot *name) { - if (unlikely(name->name.name != name->inline_name)) { + if (unlikely(name->name.name != name->inline_name.string)) { struct external_name *p; p = container_of(name->name.name, struct external_name, name[0]); - if (unlikely(atomic_dec_and_test(&p->u.count))) - kfree_rcu(p, u.head); + if (unlikely(atomic_dec_and_test(&p->count))) + kfree_rcu(p, head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); @@ -386,7 +404,7 @@ static void dentry_free(struct dentry *dentry) WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); - if (likely(atomic_dec_and_test(&p->u.count))) { + if (likely(atomic_dec_and_test(&p->count))) { call_rcu(&dentry->d_u.d_rcu, __d_free_external); return; } @@ -1654,10 +1672,10 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) * will still always have a NUL at the end, even if we might * be overwriting an internal NUL character */ - dentry->d_iname[DNAME_INLINE_LEN-1] = 0; + dentry->d_shortname.string[DNAME_INLINE_LEN-1] = 0; if (unlikely(!name)) { name = &slash_name; - dname = dentry->d_iname; + dname = dentry->d_shortname.string; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); struct external_name *p = kmalloc(size + name->len, @@ -1667,10 +1685,10 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) kmem_cache_free(dentry_cache, dentry); return NULL; } - atomic_set(&p->u.count, 1); + atomic_set(&p->count, 1); dname = p->name; } else { - dname = dentry->d_iname; + dname = dentry->d_shortname.string; } dentry->d_name.len = name->len; @@ -1682,7 +1700,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ dentry->d_flags = 0; - lockref_init(&dentry->d_lockref, 1); + lockref_init(&dentry->d_lockref); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; @@ -2728,10 +2746,9 @@ static void swap_names(struct dentry *dentry, struct dentry *target) * dentry:internal, target:external. Steal target's * storage and make target internal. */ - memcpy(target->d_iname, dentry->d_name.name, - dentry->d_name.len + 1); dentry->d_name.name = target->d_name.name; - target->d_name.name = target->d_iname; + target->d_shortname = dentry->d_shortname; + target->d_name.name = target->d_shortname.string; } } else { if (unlikely(dname_external(dentry))) { @@ -2739,20 +2756,16 @@ static void swap_names(struct dentry *dentry, struct dentry *target) * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ - memcpy(dentry->d_iname, target->d_name.name, - target->d_name.len + 1); target->d_name.name = dentry->d_name.name; - dentry->d_name.name = dentry->d_iname; + dentry->d_shortname = target->d_shortname; + dentry->d_name.name = dentry->d_shortname.string; } else { /* * Both are internal. */ - unsigned int i; - BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); - for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { - swap(((long *) &dentry->d_iname)[i], - ((long *) &target->d_iname)[i]); - } + for (int i = 0; i < DNAME_INLINE_WORDS; i++) + swap(dentry->d_shortname.words[i], + target->d_shortname.words[i]); } } swap(dentry->d_name.hash_len, target->d_name.hash_len); @@ -2764,16 +2777,15 @@ static void copy_name(struct dentry *dentry, struct dentry *target) if (unlikely(dname_external(dentry))) old_name = external_name(dentry); if (unlikely(dname_external(target))) { - atomic_inc(&external_name(target)->u.count); + atomic_inc(&external_name(target)->count); dentry->d_name = target->d_name; } else { - memcpy(dentry->d_iname, target->d_name.name, - target->d_name.len + 1); - dentry->d_name.name = dentry->d_iname; + dentry->d_shortname = target->d_shortname; + dentry->d_name.name = dentry->d_shortname.string; dentry->d_name.hash_len = target->d_name.hash_len; } - if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) - kfree_rcu(old_name, u.head); + if (old_name && likely(atomic_dec_and_test(&old_name->count))) + kfree_rcu(old_name, head); } /* @@ -2954,7 +2966,12 @@ static int __d_unalias(struct dentry *dentry, struct dentry *alias) goto out_err; m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: + if (alias->d_op && alias->d_op->d_unalias_trylock && + !alias->d_op->d_unalias_trylock(alias)) + goto out_err; __d_move(alias, dentry, false); + if (alias->d_op && alias->d_op->d_unalias_unlock) + alias->d_op->d_unalias_unlock(alias); ret = 0; out_err: if (m2) @@ -3102,12 +3119,12 @@ void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; - BUG_ON(dentry->d_name.name != dentry->d_iname || + BUG_ON(dname_external(dentry) || !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - dentry->d_name.len = sprintf(dentry->d_iname, "#%llu", + dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); @@ -3195,7 +3212,7 @@ static void __init dcache_init(void) */ dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, - d_iname); + d_shortname.string); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index e33cc77699cd..69e9ddcb113d 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -94,6 +94,7 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode) fsd = d_fsd; } else { struct inode *inode = dentry->d_inode; + unsigned int methods = 0; if (WARN_ON(mode == DBGFS_GET_ALREADY)) return -EINVAL; @@ -106,25 +107,28 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode) const struct debugfs_short_fops *ops; ops = fsd->short_fops = DEBUGFS_I(inode)->short_fops; if (ops->llseek) - fsd->methods |= HAS_LSEEK; + methods |= HAS_LSEEK; if (ops->read) - fsd->methods |= HAS_READ; + methods |= HAS_READ; if (ops->write) - fsd->methods |= HAS_WRITE; + methods |= HAS_WRITE; + fsd->real_fops = NULL; } else { const struct file_operations *ops; ops = fsd->real_fops = DEBUGFS_I(inode)->real_fops; if (ops->llseek) - fsd->methods |= HAS_LSEEK; + methods |= HAS_LSEEK; if (ops->read) - fsd->methods |= HAS_READ; + methods |= HAS_READ; if (ops->write) - fsd->methods |= HAS_WRITE; + methods |= HAS_WRITE; if (ops->unlocked_ioctl) - fsd->methods |= HAS_IOCTL; + methods |= HAS_IOCTL; if (ops->poll) - fsd->methods |= HAS_POLL; + methods |= HAS_POLL; + fsd->short_fops = NULL; } + fsd->methods = methods; refcount_set(&fsd->active_users, 1); init_completion(&fsd->active_users_drained); INIT_LIST_HEAD(&fsd->cancellations); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index b20e565b9c5e..1096ff8562fa 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -45,7 +45,7 @@ static int pty_limit_min; static int pty_limit_max = INT_MAX; static atomic_t pty_count = ATOMIC_INIT(0); -static struct ctl_table pty_table[] = { +static const struct ctl_table pty_table[] = { { .procname = "max", .maxlen = sizeof(int), diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index acaa0825e9bb..1dfd5b81d831 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -17,7 +17,9 @@ /** * ecryptfs_d_revalidate - revalidate an ecryptfs dentry - * @dentry: The ecryptfs dentry + * @dir: inode of expected parent + * @name: expected name + * @dentry: dentry to revalidate * @flags: lookup flags * * Called when the VFS needs to revalidate a dentry. This @@ -28,7 +30,8 @@ * Returns 1 if valid, 0 otherwise. * */ -static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) +static int ecryptfs_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); int rc = 1; @@ -36,8 +39,15 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE) - rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); + if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE) { + struct inode *lower_dir = ecryptfs_inode_to_lower(dir); + struct name_snapshot n; + + take_dentry_name_snapshot(&n, lower_dentry); + rc = lower_dentry->d_op->d_revalidate(lower_dir, &n.name, + lower_dentry, flags); + release_dentry_name_snapshot(&n); + } if (d_really_is_positive(dentry)) { struct inode *inode = d_inode(dentry); diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 7940241d9355..df2777e05661 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -407,7 +407,7 @@ int erofs_getxattr(struct inode *inode, int index, const char *name, } it.index = index; - it.name = (struct qstr)QSTR_INIT(name, strlen(name)); + it.name = QSTR(name); if (it.name.len > EROFS_NAME_LEN) return -ERANGE; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 29f8963bb523..d771e06db738 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -726,7 +726,7 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) if (IS_ERR(pcl)) return PTR_ERR(pcl); - lockref_init(&pcl->lockref, 1); /* one ref for this request */ + lockref_init(&pcl->lockref); /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index f9898e60dd8b..7c0980db77b3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -318,7 +318,7 @@ static void unlist_file(struct epitems_head *head) static long long_zero; static long long_max = LONG_MAX; -static struct ctl_table epoll_table[] = { +static const struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, diff --git a/fs/exec.c b/fs/exec.c index a49839174472..506cd411f4ac 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2159,7 +2159,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ return error; } -static struct ctl_table fs_exec_sysctls[] = { +static const struct ctl_table fs_exec_sysctls[] = { { .procname = "suid_dumpable", .data = &suid_dumpable, diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 099f80645072..691dd77b6ab5 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -31,10 +31,9 @@ static inline void exfat_d_version_set(struct dentry *dentry, * If it happened, the negative dentry isn't actually negative anymore. So, * drop it. */ -static int exfat_d_revalidate(struct dentry *dentry, unsigned int flags) +static int exfat_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - int ret; - if (flags & LOOKUP_RCU) return -ECHILD; @@ -58,11 +57,7 @@ static int exfat_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; - spin_lock(&dentry->d_lock); - ret = inode_eq_iversion(d_inode(dentry->d_parent), - exfat_d_version(dentry)); - spin_unlock(&dentry->d_lock); - return ret; + return inode_eq_iversion(dir, exfat_d_version(dentry)); } /* returns the length of a struct qstr, ignoring trailing dots if necessary */ diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 26c4fc37edcf..da4263a14a20 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -322,9 +322,7 @@ restart: WARN_ON(!list_empty(&ei->i_fc_dilist)); spin_unlock(&sbi->s_fc_lock); - if (fc_dentry->fcd_name.name && - fc_dentry->fcd_name.len > DNAME_INLINE_LEN) - kfree(fc_dentry->fcd_name.name); + release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); return; @@ -449,22 +447,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, node->fcd_op = dentry_update->op; node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; - if (dentry->d_name.len > DNAME_INLINE_LEN) { - node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); - if (!node->fcd_name.name) { - kmem_cache_free(ext4_fc_dentry_cachep, node); - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); - mutex_lock(&ei->i_fc_lock); - return -ENOMEM; - } - memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, - dentry->d_name.len); - } else { - memcpy(node->fcd_iname, dentry->d_name.name, - dentry->d_name.len); - node->fcd_name.name = node->fcd_iname; - } - node->fcd_name.len = dentry->d_name.len; + take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); spin_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || @@ -832,7 +815,7 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; - int dlen = fc_dentry->fcd_name.len; + int dlen = fc_dentry->fcd_name.name.len; u8 *dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); @@ -847,7 +830,7 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); - memcpy(dst, fc_dentry->fcd_name.name, dlen); + memcpy(dst, fc_dentry->fcd_name.name.name, dlen); return true; } @@ -1328,9 +1311,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) list_del_init(&fc_dentry->fcd_dilist); spin_unlock(&sbi->s_fc_lock); - if (fc_dentry->fcd_name.name && - fc_dentry->fcd_name.len > DNAME_INLINE_LEN) - kfree(fc_dentry->fcd_name.name); + release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); spin_lock(&sbi->s_fc_lock); } diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 2fadb2c4780c..3bd534e4dbbf 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -109,8 +109,7 @@ struct ext4_fc_dentry_update { int fcd_op; /* Type of update create / unlink / link */ int fcd_parent; /* Parent inode number */ int fcd_ino; /* Inode number */ - struct qstr fcd_name; /* Dirent name */ - unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */ + struct name_snapshot fcd_name; /* Dirent name */ struct list_head fcd_list; struct list_head fcd_dilist; }; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 15bf32c21ac0..926c26e90ef8 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -43,17 +43,13 @@ static inline void vfat_d_version_set(struct dentry *dentry, * If it happened, the negative dentry isn't actually negative * anymore. So, drop it. */ -static int vfat_revalidate_shortname(struct dentry *dentry) +static bool vfat_revalidate_shortname(struct dentry *dentry, struct inode *dir) { - int ret = 1; - spin_lock(&dentry->d_lock); - if (!inode_eq_iversion(d_inode(dentry->d_parent), vfat_d_version(dentry))) - ret = 0; - spin_unlock(&dentry->d_lock); - return ret; + return inode_eq_iversion(dir, vfat_d_version(dentry)); } -static int vfat_revalidate(struct dentry *dentry, unsigned int flags) +static int vfat_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; @@ -61,10 +57,11 @@ static int vfat_revalidate(struct dentry *dentry, unsigned int flags) /* This is not negative dentry. Always valid. */ if (d_really_is_positive(dentry)) return 1; - return vfat_revalidate_shortname(dentry); + return vfat_revalidate_shortname(dentry, dir); } -static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags) +static int vfat_revalidate_ci(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; @@ -97,7 +94,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags) if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; - return vfat_revalidate_shortname(dentry); + return vfat_revalidate_shortname(dentry, dir); } /* returns the length of a struct qstr, ignoring trailing dots */ diff --git a/fs/file_table.c b/fs/file_table.c index a32171d2b83f..5c00dc38558d 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -106,7 +106,7 @@ static int proc_nr_files(const struct ctl_table *table, int write, void *buffer, return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table fs_stat_sysctls[] = { +static const struct ctl_table fs_stat_sysctls[] = { { .procname = "file-nr", .data = &files_stat, @@ -194,6 +194,11 @@ static int init_file(struct file *f, int flags, const struct cred *cred) * refcount bumps we should reinitialize the reused file first. */ file_ref_init(&f->f_ref, 1); + /* + * Disable permission and pre-content events for all files by default. + * They may be enabled later by file_set_fsnotify_mode_from_watchers(). + */ + file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM); return 0; } @@ -351,9 +356,7 @@ static struct file *alloc_file(const struct path *path, int flags, static inline int alloc_path_pseudo(const char *name, struct inode *inode, struct vfsmount *mnt, struct path *path) { - struct qstr this = QSTR_INIT(name, strlen(name)); - - path->dentry = d_alloc_pseudo(mnt->mnt_sb, &this); + path->dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name)); if (!path->dentry) return -ENOMEM; path->mnt = mntget(mnt); @@ -377,7 +380,13 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, if (IS_ERR(file)) { ihold(inode); path_put(&path); + return file; } + /* + * Disable all fsnotify events for pseudo files by default. + * They may be enabled by caller with file_set_fsnotify_mode(). + */ + file_set_fsnotify_mode(file, FMODE_NONOTIFY); return file; } EXPORT_SYMBOL(alloc_file_pseudo); @@ -402,6 +411,11 @@ struct file *alloc_file_pseudo_noaccount(struct inode *inode, return file; } file_init_path(file, &path, fops); + /* + * Disable all fsnotify events for pseudo files by default. + * They may be enabled by caller with file_set_fsnotify_mode(). + */ + file_set_fsnotify_mode(file, FMODE_NONOTIFY); return file; } EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount); diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 8674dbfbe59d..ca215a3cba3e 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -63,3 +63,15 @@ config FUSE_PASSTHROUGH to be performed directly on a backing file. If you want to allow passthrough operations, answer Y. + +config FUSE_IO_URING + bool "FUSE communication over io-uring" + default y + depends on FUSE_FS + depends on IO_URING + help + This allows sending FUSE requests over the io-uring interface and + also adds request core affinity. + + If you want to allow fuse server/client communication through io-uring, + answer Y diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 2c372180d631..3f0f312a31c1 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -15,5 +15,6 @@ fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o fuse-$(CONFIG_SYSCTL) += sysctl.o +fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 9abbc2f2894f..0b6ee6dd1fd6 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -240,11 +240,12 @@ static int fuse_send_removemapping(struct inode *inode, args.opcode = FUSE_REMOVEMAPPING; args.nodeid = fi->nodeid; - args.in_numargs = 2; - args.in_args[0].size = sizeof(*inargp); - args.in_args[0].value = inargp; - args.in_args[1].size = inargp->count * sizeof(*remove_one); - args.in_args[1].value = remove_one; + args.in_numargs = 3; + fuse_set_zero_arg0(&args); + args.in_args[1].size = sizeof(*inargp); + args.in_args[1].value = inargp; + args.in_args[2].size = inargp->count * sizeof(*remove_one); + args.in_args[2].value = remove_one; return fuse_simple_request(fm, &args); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 27ccae63495d..5b5f789b37eb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -6,7 +6,9 @@ See the file COPYING. */ +#include "dev_uring_i.h" #include "fuse_i.h" +#include "fuse_dev_i.h" #include <linux/init.h> #include <linux/module.h> @@ -28,23 +30,8 @@ MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); -/* Ordinary requests have even IDs, while interrupts IDs are odd */ -#define FUSE_INT_REQ_BIT (1ULL << 0) -#define FUSE_REQ_ID_STEP (1ULL << 1) - static struct kmem_cache *fuse_req_cachep; -static void end_requests(struct list_head *head); - -static struct fuse_dev *fuse_get_dev(struct file *file) -{ - /* - * Lockless access is OK, because file->private data is set - * once during mount and is valid until the file is released. - */ - return READ_ONCE(file->private_data); -} - static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) { INIT_LIST_HEAD(&req->list); @@ -89,7 +76,8 @@ void fuse_set_initialized(struct fuse_conn *fc) static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) { - return !fc->initialized || (for_background && fc->blocked); + return !fc->initialized || (for_background && fc->blocked) || + (fc->io_uring && !fuse_uring_ready(fc)); } static void fuse_drop_waiting(struct fuse_conn *fc) @@ -234,7 +222,7 @@ u64 fuse_get_unique(struct fuse_iqueue *fiq) } EXPORT_SYMBOL_GPL(fuse_get_unique); -static unsigned int fuse_req_hash(u64 unique) +unsigned int fuse_req_hash(u64 unique) { return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } @@ -250,7 +238,8 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); } -static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget) +void fuse_dev_queue_forget(struct fuse_iqueue *fiq, + struct fuse_forget_link *forget) { spin_lock(&fiq->lock); if (fiq->connected) { @@ -263,7 +252,7 @@ static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_li } } -static void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->lock); if (list_empty(&req->intr_entry)) { @@ -580,7 +569,25 @@ ssize_t __fuse_simple_request(struct mnt_idmap *idmap, return ret; } -static bool fuse_request_queue_background(struct fuse_req *req) +#ifdef CONFIG_FUSE_IO_URING +static bool fuse_request_queue_background_uring(struct fuse_conn *fc, + struct fuse_req *req) +{ + struct fuse_iqueue *fiq = &fc->iq; + + req->in.h.unique = fuse_get_unique(fiq); + req->in.h.len = sizeof(struct fuse_in_header) + + fuse_len_args(req->args->in_numargs, + (struct fuse_arg *) req->args->in_args); + + return fuse_uring_queue_bq_req(req); +} +#endif + +/* + * @return true if queued + */ +static int fuse_request_queue_background(struct fuse_req *req) { struct fuse_mount *fm = req->fm; struct fuse_conn *fc = fm->fc; @@ -592,6 +599,12 @@ static bool fuse_request_queue_background(struct fuse_req *req) atomic_inc(&fc->num_waiting); } __set_bit(FR_ISREPLY, &req->flags); + +#ifdef CONFIG_FUSE_IO_URING + if (fuse_uring_ready(fc)) + return fuse_request_queue_background_uring(fc, req); +#endif + spin_lock(&fc->bg_lock); if (likely(fc->connected)) { fc->num_background++; @@ -692,22 +705,8 @@ static int unlock_request(struct fuse_req *req) return err; } -struct fuse_copy_state { - int write; - struct fuse_req *req; - struct iov_iter *iter; - struct pipe_buffer *pipebufs; - struct pipe_buffer *currbuf; - struct pipe_inode_info *pipe; - unsigned long nr_segs; - struct page *pg; - unsigned len; - unsigned offset; - unsigned move_pages:1; -}; - -static void fuse_copy_init(struct fuse_copy_state *cs, int write, - struct iov_iter *iter) +void fuse_copy_init(struct fuse_copy_state *cs, int write, + struct iov_iter *iter) { memset(cs, 0, sizeof(*cs)); cs->write = write; @@ -814,6 +813,9 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) *size -= ncpy; cs->len -= ncpy; cs->offset += ncpy; + if (cs->is_uring) + cs->ring.copied_sz += ncpy; + return ncpy; } @@ -1068,9 +1070,9 @@ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) } /* Copy request arguments to/from userspace buffer */ -static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, - unsigned argpages, struct fuse_arg *args, - int zeroing) +int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, + unsigned argpages, struct fuse_arg *args, + int zeroing) { int err = 0; unsigned i; @@ -1760,7 +1762,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, args = &ap->args; args->nodeid = outarg->nodeid; args->opcode = FUSE_NOTIFY_REPLY; - args->in_numargs = 2; + args->in_numargs = 3; args->in_pages = true; args->end = fuse_retrieve_end; @@ -1788,9 +1790,10 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, } ra->inarg.offset = outarg->offset; ra->inarg.size = total_len; - args->in_args[0].size = sizeof(ra->inarg); - args->in_args[0].value = &ra->inarg; - args->in_args[1].size = total_len; + fuse_set_zero_arg0(args); + args->in_args[1].size = sizeof(ra->inarg); + args->in_args[1].value = &ra->inarg; + args->in_args[2].size = total_len; err = fuse_simple_notify_reply(fm, args, outarg->notify_unique); if (err) @@ -1885,7 +1888,7 @@ static void fuse_resend(struct fuse_conn *fc) spin_unlock(&fiq->lock); list_for_each_entry(req, &to_queue, list) clear_bit(FR_PENDING, &req->flags); - end_requests(&to_queue); + fuse_dev_end_requests(&to_queue); return; } /* iq and pq requests are both oldest to newest */ @@ -1934,7 +1937,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, } /* Look up request on processing list by unique ID */ -static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) +struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique) { unsigned int hash = fuse_req_hash(unique); struct fuse_req *req; @@ -1946,10 +1949,17 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) return NULL; } -static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, - unsigned nbytes) +int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, + unsigned nbytes) { - unsigned reqsize = sizeof(struct fuse_out_header); + + unsigned int reqsize = 0; + + /* + * Uring has all headers separated from args - args is payload only + */ + if (!cs->is_uring) + reqsize = sizeof(struct fuse_out_header); reqsize += fuse_len_args(args->out_numargs, args->out_args); @@ -2011,7 +2021,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, spin_lock(&fpq->lock); req = NULL; if (fpq->connected) - req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); + req = fuse_request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); err = -ENOENT; if (!req) { @@ -2049,7 +2059,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, if (oh.error) err = nbytes != sizeof(oh) ? -EINVAL : 0; else - err = copy_out_args(cs, req->args, nbytes); + err = fuse_copy_out_args(cs, req->args, nbytes); fuse_copy_finish(cs); spin_lock(&fpq->lock); @@ -2204,7 +2214,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) } /* Abort all requests on the given list (pending or processing) */ -static void end_requests(struct list_head *head) +void fuse_dev_end_requests(struct list_head *head) { while (!list_empty(head)) { struct fuse_req *req; @@ -2307,7 +2317,13 @@ void fuse_abort_conn(struct fuse_conn *fc) wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); - end_requests(&to_end); + fuse_dev_end_requests(&to_end); + + /* + * fc->lock must not be taken to avoid conflicts with io-uring + * locks + */ + fuse_uring_abort(fc); } else { spin_unlock(&fc->lock); } @@ -2319,6 +2335,8 @@ void fuse_wait_aborted(struct fuse_conn *fc) /* matches implicit memory barrier in fuse_drop_waiting() */ smp_mb(); wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0); + + fuse_uring_wait_stopped_queues(fc); } int fuse_dev_release(struct inode *inode, struct file *file) @@ -2337,7 +2355,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) list_splice_init(&fpq->processing[i], &to_end); spin_unlock(&fpq->lock); - end_requests(&to_end); + fuse_dev_end_requests(&to_end); /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { @@ -2475,6 +2493,9 @@ const struct file_operations fuse_dev_operations = { .fasync = fuse_dev_fasync, .unlocked_ioctl = fuse_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, +#ifdef CONFIG_FUSE_IO_URING + .uring_cmd = fuse_uring_cmd, +#endif }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c new file mode 100644 index 000000000000..ebd2931b4f2a --- /dev/null +++ b/fs/fuse/dev_uring.c @@ -0,0 +1,1319 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE: Filesystem in Userspace + * Copyright (c) 2023-2024 DataDirect Networks. + */ + +#include "fuse_i.h" +#include "dev_uring_i.h" +#include "fuse_dev_i.h" + +#include <linux/fs.h> +#include <linux/io_uring/cmd.h> + +static bool __read_mostly enable_uring; +module_param(enable_uring, bool, 0644); +MODULE_PARM_DESC(enable_uring, + "Enable userspace communication through io-uring"); + +#define FUSE_URING_IOV_SEGS 2 /* header and payload */ + + +bool fuse_uring_enabled(void) +{ + return enable_uring; +} + +struct fuse_uring_pdu { + struct fuse_ring_ent *ent; +}; + +static const struct fuse_iqueue_ops fuse_io_uring_ops; + +static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd, + struct fuse_ring_ent *ring_ent) +{ + struct fuse_uring_pdu *pdu = + io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); + + pdu->ent = ring_ent; +} + +static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) +{ + struct fuse_uring_pdu *pdu = + io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); + + return pdu->ent; +} + +static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + + lockdep_assert_held(&queue->lock); + lockdep_assert_held(&fc->bg_lock); + + /* + * Allow one bg request per queue, ignoring global fc limits. + * This prevents a single queue from consuming all resources and + * eliminates the need for remote queue wake-ups when global + * limits are met but this queue has no more waiting requests. + */ + while ((fc->active_background < fc->max_background || + !queue->active_background) && + (!list_empty(&queue->fuse_req_bg_queue))) { + struct fuse_req *req; + + req = list_first_entry(&queue->fuse_req_bg_queue, + struct fuse_req, list); + fc->active_background++; + queue->active_background++; + + list_move_tail(&req->list, &queue->fuse_req_queue); + } +} + +static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, + int error) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + + lockdep_assert_not_held(&queue->lock); + spin_lock(&queue->lock); + ent->fuse_req = NULL; + if (test_bit(FR_BACKGROUND, &req->flags)) { + queue->active_background--; + spin_lock(&fc->bg_lock); + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + } + + spin_unlock(&queue->lock); + + if (error) + req->out.h.error = error; + + clear_bit(FR_SENT, &req->flags); + fuse_request_end(req); +} + +/* Abort all list queued request on the given ring queue */ +static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) +{ + struct fuse_req *req; + LIST_HEAD(req_list); + + spin_lock(&queue->lock); + list_for_each_entry(req, &queue->fuse_req_queue, list) + clear_bit(FR_PENDING, &req->flags); + list_splice_init(&queue->fuse_req_queue, &req_list); + spin_unlock(&queue->lock); + + /* must not hold queue lock to avoid order issues with fi->lock */ + fuse_dev_end_requests(&req_list); +} + +void fuse_uring_abort_end_requests(struct fuse_ring *ring) +{ + int qid; + struct fuse_ring_queue *queue; + struct fuse_conn *fc = ring->fc; + + for (qid = 0; qid < ring->nr_queues; qid++) { + queue = READ_ONCE(ring->queues[qid]); + if (!queue) + continue; + + queue->stopped = true; + + WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); + spin_lock(&queue->lock); + spin_lock(&fc->bg_lock); + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + spin_unlock(&queue->lock); + fuse_uring_abort_end_queue_requests(queue); + } +} + +void fuse_uring_destruct(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + int qid; + + if (!ring) + return; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = ring->queues[qid]; + struct fuse_ring_ent *ent, *next; + + if (!queue) + continue; + + WARN_ON(!list_empty(&queue->ent_avail_queue)); + WARN_ON(!list_empty(&queue->ent_w_req_queue)); + WARN_ON(!list_empty(&queue->ent_commit_queue)); + WARN_ON(!list_empty(&queue->ent_in_userspace)); + + list_for_each_entry_safe(ent, next, &queue->ent_released, + list) { + list_del_init(&ent->list); + kfree(ent); + } + + kfree(queue->fpq.processing); + kfree(queue); + ring->queues[qid] = NULL; + } + + kfree(ring->queues); + kfree(ring); + fc->ring = NULL; +} + +/* + * Basic ring setup for this connection based on the provided configuration + */ +static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) +{ + struct fuse_ring *ring; + size_t nr_queues = num_possible_cpus(); + struct fuse_ring *res = NULL; + size_t max_payload_size; + + ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT); + if (!ring) + return NULL; + + ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), + GFP_KERNEL_ACCOUNT); + if (!ring->queues) + goto out_err; + + max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); + max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); + + spin_lock(&fc->lock); + if (fc->ring) { + /* race, another thread created the ring in the meantime */ + spin_unlock(&fc->lock); + res = fc->ring; + goto out_err; + } + + init_waitqueue_head(&ring->stop_waitq); + + fc->ring = ring; + ring->nr_queues = nr_queues; + ring->fc = fc; + ring->max_payload_sz = max_payload_size; + atomic_set(&ring->queue_refs, 0); + + spin_unlock(&fc->lock); + return ring; + +out_err: + kfree(ring->queues); + kfree(ring); + return res; +} + +static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, + int qid) +{ + struct fuse_conn *fc = ring->fc; + struct fuse_ring_queue *queue; + struct list_head *pq; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); + if (!queue) + return NULL; + pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); + if (!pq) { + kfree(queue); + return NULL; + } + + queue->qid = qid; + queue->ring = ring; + spin_lock_init(&queue->lock); + + INIT_LIST_HEAD(&queue->ent_avail_queue); + INIT_LIST_HEAD(&queue->ent_commit_queue); + INIT_LIST_HEAD(&queue->ent_w_req_queue); + INIT_LIST_HEAD(&queue->ent_in_userspace); + INIT_LIST_HEAD(&queue->fuse_req_queue); + INIT_LIST_HEAD(&queue->fuse_req_bg_queue); + INIT_LIST_HEAD(&queue->ent_released); + + queue->fpq.processing = pq; + fuse_pqueue_init(&queue->fpq); + + spin_lock(&fc->lock); + if (ring->queues[qid]) { + spin_unlock(&fc->lock); + kfree(queue->fpq.processing); + kfree(queue); + return ring->queues[qid]; + } + + /* + * write_once and lock as the caller mostly doesn't take the lock at all + */ + WRITE_ONCE(ring->queues[qid], queue); + spin_unlock(&fc->lock); + + return queue; +} + +static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) +{ + clear_bit(FR_SENT, &req->flags); + req->out.h.error = -ECONNABORTED; + fuse_request_end(req); +} + +/* + * Release a request/entry on connection tear down + */ +static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) +{ + struct fuse_req *req; + struct io_uring_cmd *cmd; + + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + cmd = ent->cmd; + ent->cmd = NULL; + req = ent->fuse_req; + ent->fuse_req = NULL; + if (req) { + /* remove entry from queue->fpq->processing */ + list_del_init(&req->list); + } + + /* + * The entry must not be freed immediately, due to access of direct + * pointer access of entries through IO_URING_F_CANCEL - there is a risk + * of race between daemon termination (which triggers IO_URING_F_CANCEL + * and accesses entries without checking the list state first + */ + list_move(&ent->list, &queue->ent_released); + ent->state = FRRS_RELEASED; + spin_unlock(&queue->lock); + + if (cmd) + io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); + + if (req) + fuse_uring_stop_fuse_req_end(req); +} + +static void fuse_uring_stop_list_entries(struct list_head *head, + struct fuse_ring_queue *queue, + enum fuse_ring_req_state exp_state) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_ring_ent *ent, *next; + ssize_t queue_refs = SSIZE_MAX; + LIST_HEAD(to_teardown); + + spin_lock(&queue->lock); + list_for_each_entry_safe(ent, next, head, list) { + if (ent->state != exp_state) { + pr_warn("entry teardown qid=%d state=%d expected=%d", + queue->qid, ent->state, exp_state); + continue; + } + + ent->state = FRRS_TEARDOWN; + list_move(&ent->list, &to_teardown); + } + spin_unlock(&queue->lock); + + /* no queue lock to avoid lock order issues */ + list_for_each_entry_safe(ent, next, &to_teardown, list) { + fuse_uring_entry_teardown(ent); + queue_refs = atomic_dec_return(&ring->queue_refs); + WARN_ON_ONCE(queue_refs < 0); + } +} + +static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) +{ + fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue, + FRRS_USERSPACE); + fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue, + FRRS_AVAILABLE); +} + +/* + * Log state debug info + */ +static void fuse_uring_log_ent_state(struct fuse_ring *ring) +{ + int qid; + struct fuse_ring_ent *ent; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = ring->queues[qid]; + + if (!queue) + continue; + + spin_lock(&queue->lock); + /* + * Log entries from the intermediate queue, the other queues + * should be empty + */ + list_for_each_entry(ent, &queue->ent_w_req_queue, list) { + pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n", + ring, qid, ent, ent->state); + } + list_for_each_entry(ent, &queue->ent_commit_queue, list) { + pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", + ring, qid, ent, ent->state); + } + spin_unlock(&queue->lock); + } + ring->stop_debug_log = 1; +} + +static void fuse_uring_async_stop_queues(struct work_struct *work) +{ + int qid; + struct fuse_ring *ring = + container_of(work, struct fuse_ring, async_teardown_work.work); + + /* XXX code dup */ + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); + + if (!queue) + continue; + + fuse_uring_teardown_entries(queue); + } + + /* + * Some ring entries might be in the middle of IO operations, + * i.e. in process to get handled by file_operations::uring_cmd + * or on the way to userspace - we could handle that with conditions in + * run time code, but easier/cleaner to have an async tear down handler + * If there are still queue references left + */ + if (atomic_read(&ring->queue_refs) > 0) { + if (time_after(jiffies, + ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT)) + fuse_uring_log_ent_state(ring); + + schedule_delayed_work(&ring->async_teardown_work, + FUSE_URING_TEARDOWN_INTERVAL); + } else { + wake_up_all(&ring->stop_waitq); + } +} + +/* + * Stop the ring queues + */ +void fuse_uring_stop_queues(struct fuse_ring *ring) +{ + int qid; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); + + if (!queue) + continue; + + fuse_uring_teardown_entries(queue); + } + + if (atomic_read(&ring->queue_refs) > 0) { + ring->teardown_time = jiffies; + INIT_DELAYED_WORK(&ring->async_teardown_work, + fuse_uring_async_stop_queues); + schedule_delayed_work(&ring->async_teardown_work, + FUSE_URING_TEARDOWN_INTERVAL); + } else { + wake_up_all(&ring->stop_waitq); + } +} + +/* + * Handle IO_URING_F_CANCEL, typically should come on daemon termination. + * + * Releasing the last entry should trigger fuse_dev_release() if + * the daemon was terminated + */ +static void fuse_uring_cancel(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); + struct fuse_ring_queue *queue; + bool need_cmd_done = false; + + /* + * direct access on ent - it must not be destructed as long as + * IO_URING_F_CANCEL might come up + */ + queue = ent->queue; + spin_lock(&queue->lock); + if (ent->state == FRRS_AVAILABLE) { + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + need_cmd_done = true; + ent->cmd = NULL; + } + spin_unlock(&queue->lock); + + if (need_cmd_done) { + /* no queue lock to avoid lock order issues */ + io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); + } +} + +static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, + struct fuse_ring_ent *ring_ent) +{ + uring_cmd_set_ring_ent(cmd, ring_ent); + io_uring_cmd_mark_cancelable(cmd, issue_flags); +} + +/* + * Checks for errors and stores it into the request + */ +static int fuse_uring_out_header_has_err(struct fuse_out_header *oh, + struct fuse_req *req, + struct fuse_conn *fc) +{ + int err; + + err = -EINVAL; + if (oh->unique == 0) { + /* Not supported through io-uring yet */ + pr_warn_once("notify through fuse-io-uring not supported\n"); + goto err; + } + + if (oh->error <= -ERESTARTSYS || oh->error > 0) + goto err; + + if (oh->error) { + err = oh->error; + goto err; + } + + err = -ENOENT; + if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) { + pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n", + req->in.h.unique, + oh->unique & ~FUSE_INT_REQ_BIT); + goto err; + } + + /* + * Is it an interrupt reply ID? + * XXX: Not supported through fuse-io-uring yet, it should not even + * find the request - should not happen. + */ + WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT); + + err = 0; +err: + return err; +} + +static int fuse_uring_copy_from_ring(struct fuse_ring *ring, + struct fuse_req *req, + struct fuse_ring_ent *ent) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct iov_iter iter; + int err; + struct fuse_uring_ent_in_out ring_in_out; + + err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out, + sizeof(ring_in_out)); + if (err) + return -EFAULT; + + err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz, + &iter); + if (err) + return err; + + fuse_copy_init(&cs, 0, &iter); + cs.is_uring = 1; + cs.req = req; + + return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); +} + + /* + * Copy data from the req to the ring buffer + */ +static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, + struct fuse_ring_ent *ent) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct fuse_in_arg *in_args = args->in_args; + int num_args = args->in_numargs; + int err; + struct iov_iter iter; + struct fuse_uring_ent_in_out ent_in_out = { + .flags = 0, + .commit_id = req->in.h.unique, + }; + + err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter); + if (err) { + pr_info_ratelimited("fuse: Import of user buffer failed\n"); + return err; + } + + fuse_copy_init(&cs, 1, &iter); + cs.is_uring = 1; + cs.req = req; + + if (num_args > 0) { + /* + * Expectation is that the first argument is the per op header. + * Some op code have that as zero size. + */ + if (args->in_args[0].size > 0) { + err = copy_to_user(&ent->headers->op_in, in_args->value, + in_args->size); + if (err) { + pr_info_ratelimited( + "Copying the header failed.\n"); + return -EFAULT; + } + } + in_args++; + num_args--; + } + + /* copy the payload */ + err = fuse_copy_args(&cs, num_args, args->in_pages, + (struct fuse_arg *)in_args, 0); + if (err) { + pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); + return err; + } + + ent_in_out.payload_sz = cs.ring.copied_sz; + err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, + sizeof(ent_in_out)); + return err ? -EFAULT : 0; +} + +static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + int err; + + err = -EIO; + if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { + pr_err("qid=%d ring-req=%p invalid state %d on send\n", + queue->qid, ent, ent->state); + return err; + } + + err = -EINVAL; + if (WARN_ON(req->in.h.unique == 0)) + return err; + + /* copy the request */ + err = fuse_uring_args_to_ring(ring, req, ent); + if (unlikely(err)) { + pr_info_ratelimited("Copy to ring failed: %d\n", err); + return err; + } + + /* copy fuse_in_header */ + err = copy_to_user(&ent->headers->in_out, &req->in.h, + sizeof(req->in.h)); + if (err) { + err = -EFAULT; + return err; + } + + return 0; +} + +static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + int err; + + err = fuse_uring_copy_to_ring(ent, req); + if (!err) + set_bit(FR_SENT, &req->flags); + else + fuse_uring_req_end(ent, req, err); + + return err; +} + +/* + * Write data to the ring buffer and send the request to userspace, + * userspace will read it + * This is comparable with classical read(/dev/fuse) + */ +static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, + struct fuse_req *req, + unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + int err; + struct io_uring_cmd *cmd; + + err = fuse_uring_prepare_send(ent, req); + if (err) + return err; + + spin_lock(&queue->lock); + cmd = ent->cmd; + ent->cmd = NULL; + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + spin_unlock(&queue->lock); + + io_uring_cmd_done(cmd, 0, 0, issue_flags); + return 0; +} + +/* + * Make a ring entry available for fuse_req assignment + */ +static void fuse_uring_ent_avail(struct fuse_ring_ent *ent, + struct fuse_ring_queue *queue) +{ + WARN_ON_ONCE(!ent->cmd); + list_move(&ent->list, &queue->ent_avail_queue); + ent->state = FRRS_AVAILABLE; +} + +/* Used to find the request on SQE commit */ +static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_pqueue *fpq = &queue->fpq; + unsigned int hash; + + req->ring_entry = ent; + hash = fuse_req_hash(req->in.h.unique); + list_move_tail(&req->list, &fpq->processing[hash]); +} + +/* + * Assign a fuse queue entry to the given entry + */ +static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_conn *fc = req->fm->fc; + struct fuse_iqueue *fiq = &fc->iq; + + lockdep_assert_held(&queue->lock); + + if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE && + ent->state != FRRS_COMMIT)) { + pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid, + ent->state); + } + + spin_lock(&fiq->lock); + clear_bit(FR_PENDING, &req->flags); + spin_unlock(&fiq->lock); + ent->fuse_req = req; + ent->state = FRRS_FUSE_REQ; + list_move(&ent->list, &queue->ent_w_req_queue); + fuse_uring_add_to_pq(ent, req); +} + +/* Fetch the next fuse request if available */ +static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent) + __must_hold(&queue->lock) +{ + struct fuse_req *req; + struct fuse_ring_queue *queue = ent->queue; + struct list_head *req_queue = &queue->fuse_req_queue; + + lockdep_assert_held(&queue->lock); + + /* get and assign the next entry while it is still holding the lock */ + req = list_first_entry_or_null(req_queue, struct fuse_req, list); + if (req) + fuse_uring_add_req_to_ring_ent(ent, req); + + return req; +} + +/* + * Read data from the ring buffer, which user space has written to + * This is comparible with handling of classical write(/dev/fuse). + * Also make the ring request available again for new fuse requests. + */ +static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req, + unsigned int issue_flags) +{ + struct fuse_ring *ring = ent->queue->ring; + struct fuse_conn *fc = ring->fc; + ssize_t err = 0; + + err = copy_from_user(&req->out.h, &ent->headers->in_out, + sizeof(req->out.h)); + if (err) { + req->out.h.error = -EFAULT; + goto out; + } + + err = fuse_uring_out_header_has_err(&req->out.h, req, fc); + if (err) { + /* req->out.h.error already set */ + goto out; + } + + err = fuse_uring_copy_from_ring(ring, req, ent); +out: + fuse_uring_req_end(ent, req, err); +} + +/* + * Get the next fuse req and send it + */ +static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent, + struct fuse_ring_queue *queue, + unsigned int issue_flags) +{ + int err; + struct fuse_req *req; + +retry: + spin_lock(&queue->lock); + fuse_uring_ent_avail(ent, queue); + req = fuse_uring_ent_assign_req(ent); + spin_unlock(&queue->lock); + + if (req) { + err = fuse_uring_send_next_to_ring(ent, req, issue_flags); + if (err) + goto retry; + } +} + +static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent) +{ + struct fuse_ring_queue *queue = ent->queue; + + lockdep_assert_held(&queue->lock); + + if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE)) + return -EIO; + + ent->state = FRRS_COMMIT; + list_move(&ent->list, &queue->ent_commit_queue); + + return 0; +} + +/* FUSE_URING_CMD_COMMIT_AND_FETCH handler */ +static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, + struct fuse_conn *fc) +{ + const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); + struct fuse_ring_ent *ent; + int err; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + uint64_t commit_id = READ_ONCE(cmd_req->commit_id); + unsigned int qid = READ_ONCE(cmd_req->qid); + struct fuse_pqueue *fpq; + struct fuse_req *req; + + err = -ENOTCONN; + if (!ring) + return err; + + if (qid >= ring->nr_queues) + return -EINVAL; + + queue = ring->queues[qid]; + if (!queue) + return err; + fpq = &queue->fpq; + + if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped)) + return err; + + spin_lock(&queue->lock); + /* Find a request based on the unique ID of the fuse request + * This should get revised, as it needs a hash calculation and list + * search. And full struct fuse_pqueue is needed (memory overhead). + * As well as the link from req to ring_ent. + */ + req = fuse_request_find(fpq, commit_id); + err = -ENOENT; + if (!req) { + pr_info("qid=%d commit_id %llu not found\n", queue->qid, + commit_id); + spin_unlock(&queue->lock); + return err; + } + list_del_init(&req->list); + ent = req->ring_entry; + req->ring_entry = NULL; + + err = fuse_ring_ent_set_commit(ent); + if (err != 0) { + pr_info_ratelimited("qid=%d commit_id %llu state %d", + queue->qid, commit_id, ent->state); + spin_unlock(&queue->lock); + req->out.h.error = err; + clear_bit(FR_SENT, &req->flags); + fuse_request_end(req); + return err; + } + + ent->cmd = cmd; + spin_unlock(&queue->lock); + + /* without the queue lock, as other locks are taken */ + fuse_uring_prepare_cancel(cmd, issue_flags, ent); + fuse_uring_commit(ent, req, issue_flags); + + /* + * Fetching the next request is absolutely required as queued + * fuse requests would otherwise not get processed - committing + * and fetching is done in one step vs legacy fuse, which has separated + * read (fetch request) and write (commit result). + */ + fuse_uring_next_fuse_req(ent, queue, issue_flags); + return 0; +} + +static bool is_ring_ready(struct fuse_ring *ring, int current_qid) +{ + int qid; + struct fuse_ring_queue *queue; + bool ready = true; + + for (qid = 0; qid < ring->nr_queues && ready; qid++) { + if (current_qid == qid) + continue; + + queue = ring->queues[qid]; + if (!queue) { + ready = false; + break; + } + + spin_lock(&queue->lock); + if (list_empty(&queue->ent_avail_queue)) + ready = false; + spin_unlock(&queue->lock); + } + + return ready; +} + +/* + * fuse_uring_req_fetch command handling + */ +static void fuse_uring_do_register(struct fuse_ring_ent *ent, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + struct fuse_iqueue *fiq = &fc->iq; + + fuse_uring_prepare_cancel(cmd, issue_flags, ent); + + spin_lock(&queue->lock); + ent->cmd = cmd; + fuse_uring_ent_avail(ent, queue); + spin_unlock(&queue->lock); + + if (!ring->ready) { + bool ready = is_ring_ready(ring, queue->qid); + + if (ready) { + WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); + WRITE_ONCE(ring->ready, true); + wake_up_all(&fc->blocked_waitq); + } + } +} + +/* + * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] + * the payload + */ +static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, + struct iovec iov[FUSE_URING_IOV_SEGS]) +{ + struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr)); + struct iov_iter iter; + ssize_t ret; + + if (sqe->len != FUSE_URING_IOV_SEGS) + return -EINVAL; + + /* + * Direction for buffer access will actually be READ and WRITE, + * using write for the import should include READ access as well. + */ + ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS, + FUSE_URING_IOV_SEGS, &iov, &iter); + if (ret < 0) + return ret; + + return 0; +} + +static struct fuse_ring_ent * +fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, + struct fuse_ring_queue *queue) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_ring_ent *ent; + size_t payload_size; + struct iovec iov[FUSE_URING_IOV_SEGS]; + int err; + + err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov); + if (err) { + pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n", + err); + return ERR_PTR(err); + } + + err = -EINVAL; + if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) { + pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len); + return ERR_PTR(err); + } + + payload_size = iov[1].iov_len; + if (payload_size < ring->max_payload_sz) { + pr_info_ratelimited("Invalid req payload len %zu\n", + payload_size); + return ERR_PTR(err); + } + + err = -ENOMEM; + ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT); + if (!ent) + return ERR_PTR(err); + + INIT_LIST_HEAD(&ent->list); + + ent->queue = queue; + ent->headers = iov[0].iov_base; + ent->payload = iov[1].iov_base; + + atomic_inc(&ring->queue_refs); + return ent; +} + +/* + * Register header and payload buffer with the kernel and puts the + * entry as "ready to get fuse requests" on the queue + */ +static int fuse_uring_register(struct io_uring_cmd *cmd, + unsigned int issue_flags, struct fuse_conn *fc) +{ + const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent; + int err; + unsigned int qid = READ_ONCE(cmd_req->qid); + + err = -ENOMEM; + if (!ring) { + ring = fuse_uring_create(fc); + if (!ring) + return err; + } + + if (qid >= ring->nr_queues) { + pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); + return -EINVAL; + } + + queue = ring->queues[qid]; + if (!queue) { + queue = fuse_uring_create_queue(ring, qid); + if (!queue) + return err; + } + + /* + * The created queue above does not need to be destructed in + * case of entry errors below, will be done at ring destruction time. + */ + + ent = fuse_uring_create_ring_ent(cmd, queue); + if (IS_ERR(ent)) + return PTR_ERR(ent); + + fuse_uring_do_register(ent, cmd, issue_flags); + + return 0; +} + +/* + * Entry function from io_uring to handle the given passthrough command + * (op code IORING_OP_URING_CMD) + */ +int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct fuse_dev *fud; + struct fuse_conn *fc; + u32 cmd_op = cmd->cmd_op; + int err; + + if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { + fuse_uring_cancel(cmd, issue_flags); + return 0; + } + + /* This extra SQE size holds struct fuse_uring_cmd_req */ + if (!(issue_flags & IO_URING_F_SQE128)) + return -EINVAL; + + fud = fuse_get_dev(cmd->file); + if (!fud) { + pr_info_ratelimited("No fuse device found\n"); + return -ENOTCONN; + } + fc = fud->fc; + + /* Once a connection has io-uring enabled on it, it can't be disabled */ + if (!enable_uring && !fc->io_uring) { + pr_info_ratelimited("fuse-io-uring is disabled\n"); + return -EOPNOTSUPP; + } + + if (fc->aborted) + return -ECONNABORTED; + if (!fc->connected) + return -ENOTCONN; + + /* + * fuse_uring_register() needs the ring to be initialized, + * we need to know the max payload size + */ + if (!fc->initialized) + return -EAGAIN; + + switch (cmd_op) { + case FUSE_IO_URING_CMD_REGISTER: + err = fuse_uring_register(cmd, issue_flags, fc); + if (err) { + pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n", + err); + fc->io_uring = 0; + wake_up_all(&fc->blocked_waitq); + return err; + } + break; + case FUSE_IO_URING_CMD_COMMIT_AND_FETCH: + err = fuse_uring_commit_fetch(cmd, issue_flags, fc); + if (err) { + pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n", + err); + return err; + } + break; + default: + return -EINVAL; + } + + return -EIOCBQUEUED; +} + +static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, + ssize_t ret, unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + ent->cmd = NULL; + spin_unlock(&queue->lock); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); +} + +/* + * This prepares and sends the ring request in fuse-uring task context. + * User buffers are not mapped yet - the application does not have permission + * to write to it - this has to be executed in ring task context. + */ +static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); + struct fuse_ring_queue *queue = ent->queue; + int err; + + if (!(issue_flags & IO_URING_F_TASK_DEAD)) { + err = fuse_uring_prepare_send(ent, ent->fuse_req); + if (err) { + fuse_uring_next_fuse_req(ent, queue, issue_flags); + return; + } + } else { + err = -ECANCELED; + } + + fuse_uring_send(ent, cmd, err, issue_flags); +} + +static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) +{ + unsigned int qid; + struct fuse_ring_queue *queue; + + qid = task_cpu(current); + + if (WARN_ONCE(qid >= ring->nr_queues, + "Core number (%u) exceeds nr queues (%zu)\n", qid, + ring->nr_queues)) + qid = 0; + + queue = ring->queues[qid]; + WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); + + return queue; +} + +static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) +{ + struct io_uring_cmd *cmd = ent->cmd; + + uring_cmd_set_ring_ent(cmd, ent); + io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); +} + +/* queue a fuse request and send it if a ring entry is available */ +void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent = NULL; + int err; + + err = -EINVAL; + queue = fuse_uring_task_to_queue(ring); + if (!queue) + goto err; + + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique(fiq); + + spin_lock(&queue->lock); + err = -ENOTCONN; + if (unlikely(queue->stopped)) + goto err_unlock; + + ent = list_first_entry_or_null(&queue->ent_avail_queue, + struct fuse_ring_ent, list); + if (ent) + fuse_uring_add_req_to_ring_ent(ent, req); + else + list_add_tail(&req->list, &queue->fuse_req_queue); + spin_unlock(&queue->lock); + + if (ent) + fuse_uring_dispatch_ent(ent); + + return; + +err_unlock: + spin_unlock(&queue->lock); +err: + req->out.h.error = err; + clear_bit(FR_PENDING, &req->flags); + fuse_request_end(req); +} + +bool fuse_uring_queue_bq_req(struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent = NULL; + + queue = fuse_uring_task_to_queue(ring); + if (!queue) + return false; + + spin_lock(&queue->lock); + if (unlikely(queue->stopped)) { + spin_unlock(&queue->lock); + return false; + } + + list_add_tail(&req->list, &queue->fuse_req_bg_queue); + + ent = list_first_entry_or_null(&queue->ent_avail_queue, + struct fuse_ring_ent, list); + spin_lock(&fc->bg_lock); + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + + /* + * Due to bg_queue flush limits there might be other bg requests + * in the queue that need to be handled first. Or no further req + * might be available. + */ + req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req, + list); + if (ent && req) { + fuse_uring_add_req_to_ring_ent(ent, req); + spin_unlock(&queue->lock); + + fuse_uring_dispatch_ent(ent); + } else { + spin_unlock(&queue->lock); + } + + return true; +} + +static const struct fuse_iqueue_ops fuse_io_uring_ops = { + /* should be send over io-uring as enhancement */ + .send_forget = fuse_dev_queue_forget, + + /* + * could be send over io-uring, but interrupts should be rare, + * no need to make the code complex + */ + .send_interrupt = fuse_dev_queue_interrupt, + .send_req = fuse_uring_queue_fuse_req, +}; diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h new file mode 100644 index 000000000000..2102b3d0c1ae --- /dev/null +++ b/fs/fuse/dev_uring_i.h @@ -0,0 +1,205 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * FUSE: Filesystem in Userspace + * Copyright (c) 2023-2024 DataDirect Networks. + */ + +#ifndef _FS_FUSE_DEV_URING_I_H +#define _FS_FUSE_DEV_URING_I_H + +#include "fuse_i.h" + +#ifdef CONFIG_FUSE_IO_URING + +#define FUSE_URING_TEARDOWN_TIMEOUT (5 * HZ) +#define FUSE_URING_TEARDOWN_INTERVAL (HZ/20) + +enum fuse_ring_req_state { + FRRS_INVALID = 0, + + /* The ring entry received from userspace and it is being processed */ + FRRS_COMMIT, + + /* The ring entry is waiting for new fuse requests */ + FRRS_AVAILABLE, + + /* The ring entry got assigned a fuse req */ + FRRS_FUSE_REQ, + + /* The ring entry is in or on the way to user space */ + FRRS_USERSPACE, + + /* The ring entry is in teardown */ + FRRS_TEARDOWN, + + /* The ring entry is released, but not freed yet */ + FRRS_RELEASED, +}; + +/** A fuse ring entry, part of the ring queue */ +struct fuse_ring_ent { + /* userspace buffer */ + struct fuse_uring_req_header __user *headers; + void __user *payload; + + /* the ring queue that owns the request */ + struct fuse_ring_queue *queue; + + /* fields below are protected by queue->lock */ + + struct io_uring_cmd *cmd; + + struct list_head list; + + enum fuse_ring_req_state state; + + struct fuse_req *fuse_req; +}; + +struct fuse_ring_queue { + /* + * back pointer to the main fuse uring structure that holds this + * queue + */ + struct fuse_ring *ring; + + /* queue id, corresponds to the cpu core */ + unsigned int qid; + + /* + * queue lock, taken when any value in the queue changes _and_ also + * a ring entry state changes. + */ + spinlock_t lock; + + /* available ring entries (struct fuse_ring_ent) */ + struct list_head ent_avail_queue; + + /* + * entries in the process of being committed or in the process + * to be sent to userspace + */ + struct list_head ent_w_req_queue; + struct list_head ent_commit_queue; + + /* entries in userspace */ + struct list_head ent_in_userspace; + + /* entries that are released */ + struct list_head ent_released; + + /* fuse requests waiting for an entry slot */ + struct list_head fuse_req_queue; + + /* background fuse requests */ + struct list_head fuse_req_bg_queue; + + struct fuse_pqueue fpq; + + unsigned int active_background; + + bool stopped; +}; + +/** + * Describes if uring is for communication and holds alls the data needed + * for uring communication + */ +struct fuse_ring { + /* back pointer */ + struct fuse_conn *fc; + + /* number of ring queues */ + size_t nr_queues; + + /* maximum payload/arg size */ + size_t max_payload_sz; + + struct fuse_ring_queue **queues; + + /* + * Log ring entry states on stop when entries cannot be released + */ + unsigned int stop_debug_log : 1; + + wait_queue_head_t stop_waitq; + + /* async tear down */ + struct delayed_work async_teardown_work; + + /* log */ + unsigned long teardown_time; + + atomic_t queue_refs; + + bool ready; +}; + +bool fuse_uring_enabled(void); +void fuse_uring_destruct(struct fuse_conn *fc); +void fuse_uring_stop_queues(struct fuse_ring *ring); +void fuse_uring_abort_end_requests(struct fuse_ring *ring); +int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); +bool fuse_uring_queue_bq_req(struct fuse_req *req); + +static inline void fuse_uring_abort(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + + if (ring == NULL) + return; + + if (atomic_read(&ring->queue_refs) > 0) { + fuse_uring_abort_end_requests(ring); + fuse_uring_stop_queues(ring); + } +} + +static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + + if (ring) + wait_event(ring->stop_waitq, + atomic_read(&ring->queue_refs) == 0); +} + +static inline bool fuse_uring_ready(struct fuse_conn *fc) +{ + return fc->ring && fc->ring->ready; +} + +#else /* CONFIG_FUSE_IO_URING */ + +struct fuse_ring; + +static inline void fuse_uring_create(struct fuse_conn *fc) +{ +} + +static inline void fuse_uring_destruct(struct fuse_conn *fc) +{ +} + +static inline bool fuse_uring_enabled(void) +{ + return false; +} + +static inline void fuse_uring_abort(struct fuse_conn *fc) +{ +} + +static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) +{ +} + +static inline bool fuse_uring_ready(struct fuse_conn *fc) +{ + return false; +} + +#endif /* CONFIG_FUSE_IO_URING */ + +#endif /* _FS_FUSE_DEV_URING_I_H */ diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bf057cf7098d..198862b086ff 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -175,9 +175,12 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, memset(outarg, 0, sizeof(struct fuse_entry_out)); args->opcode = FUSE_LOOKUP; args->nodeid = nodeid; - args->in_numargs = 1; - args->in_args[0].size = name->len + 1; - args->in_args[0].value = name->name; + args->in_numargs = 3; + fuse_set_zero_arg0(args); + args->in_args[1].size = name->len; + args->in_args[1].value = name->name; + args->in_args[2].size = 1; + args->in_args[2].value = ""; args->out_numargs = 1; args->out_args[0].size = sizeof(struct fuse_entry_out); args->out_args[0].value = outarg; @@ -192,10 +195,10 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, * the lookup once more. If the lookup results in the same inode, * then refresh the attributes, timeouts and mark the dentry valid. */ -static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) +static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *entry, unsigned int flags) { struct inode *inode; - struct dentry *parent; struct fuse_mount *fm; struct fuse_inode *fi; int ret; @@ -227,11 +230,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) attr_version = fuse_get_attr_version(fm->fc); - parent = dget_parent(entry); - fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), - &entry->d_name, &outarg); + fuse_lookup_init(fm->fc, &args, get_node_id(dir), + name, &outarg); ret = fuse_simple_request(fm, &args); - dput(parent); /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) ret = -ENOENT; @@ -265,9 +266,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state)) return -ECHILD; } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { - parent = dget_parent(entry); - fuse_advise_use_readdirplus(d_inode(parent)); - dput(parent); + fuse_advise_use_readdirplus(dir); } } ret = 1; @@ -929,11 +928,12 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, FUSE_ARGS(args); args.opcode = FUSE_SYMLINK; - args.in_numargs = 2; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; - args.in_args[1].size = len; - args.in_args[1].value = link; + args.in_numargs = 3; + fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; + args.in_args[2].size = len; + args.in_args[2].value = link; return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK); } @@ -993,9 +993,10 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.opcode = FUSE_UNLINK; args.nodeid = get_node_id(dir); - args.in_numargs = 1; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; + args.in_numargs = 2; + fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); @@ -1016,9 +1017,10 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.opcode = FUSE_RMDIR; args.nodeid = get_node_id(dir); - args.in_numargs = 1; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; + args.in_numargs = 2; + fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h new file mode 100644 index 000000000000..3b2bfe1248d3 --- /dev/null +++ b/fs/fuse/fuse_dev_i.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> + */ +#ifndef _FS_FUSE_DEV_I_H +#define _FS_FUSE_DEV_I_H + +#include <linux/types.h> + +/* Ordinary requests have even IDs, while interrupts IDs are odd */ +#define FUSE_INT_REQ_BIT (1ULL << 0) +#define FUSE_REQ_ID_STEP (1ULL << 1) + +struct fuse_arg; +struct fuse_args; +struct fuse_pqueue; +struct fuse_req; +struct fuse_iqueue; +struct fuse_forget_link; + +struct fuse_copy_state { + int write; + struct fuse_req *req; + struct iov_iter *iter; + struct pipe_buffer *pipebufs; + struct pipe_buffer *currbuf; + struct pipe_inode_info *pipe; + unsigned long nr_segs; + struct page *pg; + unsigned int len; + unsigned int offset; + unsigned int move_pages:1; + unsigned int is_uring:1; + struct { + unsigned int copied_sz; /* copied size into the user buffer */ + } ring; +}; + +static inline struct fuse_dev *fuse_get_dev(struct file *file) +{ + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. + */ + return READ_ONCE(file->private_data); +} + +unsigned int fuse_req_hash(u64 unique); +struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); + +void fuse_dev_end_requests(struct list_head *head); + +void fuse_copy_init(struct fuse_copy_state *cs, int write, + struct iov_iter *iter); +int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, + unsigned int argpages, struct fuse_arg *args, + int zeroing); +int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, + unsigned int nbytes); +void fuse_dev_queue_forget(struct fuse_iqueue *fiq, + struct fuse_forget_link *forget); +void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req); + +#endif + diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 74744c6f2860..fee96fe7887b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -310,7 +310,7 @@ struct fuse_args { bool is_ext:1; bool is_pinned:1; bool invalidate_vmap:1; - struct fuse_in_arg in_args[3]; + struct fuse_in_arg in_args[4]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); /* Used for kvec iter backed by vmalloc address */ @@ -438,6 +438,10 @@ struct fuse_req { /** fuse_mount this request belongs to */ struct fuse_mount *fm; + +#ifdef CONFIG_FUSE_IO_URING + void *ring_entry; +#endif }; struct fuse_iqueue; @@ -863,6 +867,9 @@ struct fuse_conn { /* Use pages instead of pointer for kernel I/O */ unsigned int use_pages_for_kvec_io:1; + /* Use io_uring for communication */ + unsigned int io_uring; + /** Maximum stack depth for passthrough backing files */ int max_stack_depth; @@ -923,6 +930,11 @@ struct fuse_conn { /** IDR for backing files ids */ struct idr backing_files_map; #endif + +#ifdef CONFIG_FUSE_IO_URING + /** uring connection information*/ + struct fuse_ring *ring; +#endif }; /* @@ -947,6 +959,19 @@ struct fuse_mount { struct rcu_head rcu; }; +/* + * Empty header for FUSE opcodes without specific header needs. + * Used as a placeholder in args->in_args[0] for consistency + * across all FUSE operations, simplifying request handling. + */ +struct fuse_zero_header {}; + +static inline void fuse_set_zero_arg0(struct fuse_args *args) +{ + args->in_args[0].size = sizeof(struct fuse_zero_header); + args->in_args[0].value = NULL; +} + static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) { return sb->s_fs_info; @@ -1220,6 +1245,11 @@ void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** + * Initialize the fuse processing queue + */ +void fuse_pqueue_init(struct fuse_pqueue *fpq); + +/** * Initialize fuse_conn */ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3ce4f4e81d09..e9db2cb8c150 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "dev_uring_i.h" #include <linux/pagemap.h> #include <linux/slab.h> @@ -937,7 +938,7 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq, fiq->priv = priv; } -static void fuse_pqueue_init(struct fuse_pqueue *fpq) +void fuse_pqueue_init(struct fuse_pqueue *fpq) { unsigned int i; @@ -992,6 +993,8 @@ static void delayed_release(struct rcu_head *p) { struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu); + fuse_uring_destruct(fc); + put_user_ns(fc->user_ns); fc->release(fc); } @@ -1387,6 +1390,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, else ok = false; } + if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) + fc->io_uring = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1446,6 +1451,13 @@ void fuse_send_init(struct fuse_mount *fm) if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) flags |= FUSE_PASSTHROUGH; + /* + * This is just an information flag for fuse server. No need to check + * the reply - server is either sending IORING_OP_URING_CMD or not. + */ + if (fuse_uring_enabled()) + flags |= FUSE_OVER_IO_URING; + ia->in.flags = flags; ia->in.flags2 = flags >> 32; diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c index b272bb333005..63fb1e5bee30 100644 --- a/fs/fuse/sysctl.c +++ b/fs/fuse/sysctl.c @@ -13,7 +13,7 @@ static struct ctl_table_header *fuse_table_header; /* Bound by fuse_init_out max_pages, which is a u16 */ static unsigned int sysctl_fuse_max_pages_limit = 65535; -static struct ctl_table fuse_sysctl_table[] = { +static const struct ctl_table fuse_sysctl_table[] = { { .procname = "max_pages_limit", .data = &fuse_max_pages_limit, diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 9f568d345c51..93dfb06b6cea 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -164,9 +164,10 @@ int fuse_removexattr(struct inode *inode, const char *name) args.opcode = FUSE_REMOVEXATTR; args.nodeid = get_node_id(inode); - args.in_numargs = 1; - args.in_args[0].size = strlen(name) + 1; - args.in_args[0].value = name; + args.in_numargs = 2; + fuse_set_zero_arg0(&args); + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_removexattr = 1; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 2e215e8c3c88..95050e719233 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -21,7 +21,9 @@ /** * gfs2_drevalidate - Check directory lookup consistency - * @dentry: the mapping to check + * @dir: expected parent directory inode + * @name: expexted name + * @dentry: dentry to check * @flags: lookup flags * * Check to make sure the lookup necessary to arrive at this inode from its @@ -30,50 +32,43 @@ * Returns: 1 if the dentry is ok, 0 if it isn't */ -static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) +static int gfs2_drevalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - struct dentry *parent; - struct gfs2_sbd *sdp; - struct gfs2_inode *dip; + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct gfs2_inode *dip = GFS2_I(dir); struct inode *inode; struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; - int error, valid = 0; + int error, valid; int had_lock = 0; if (flags & LOOKUP_RCU) return -ECHILD; - parent = dget_parent(dentry); - sdp = GFS2_SB(d_inode(parent)); - dip = GFS2_I(d_inode(parent)); inode = d_inode(dentry); if (inode) { if (is_bad_inode(inode)) - goto out; + return 0; ip = GFS2_I(inode); } - if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) { - valid = 1; - goto out; - } + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + return 1; had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); if (!had_lock) { error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); if (error) - goto out; + return 0; } - error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip); + error = gfs2_dir_check(dir, name, ip); valid = inode ? !error : (error == -ENOENT); if (!had_lock) gfs2_glock_dq_uninit(&d_gh); -out: - dput(parent); return valid; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 8c4c1f871a88..65c07aa95718 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1201,8 +1201,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, if (glops->go_instantiate) gl->gl_flags |= BIT(GLF_INSTANTIATE_NEEDED); gl->gl_name = name; + lockref_init(&gl->gl_lockref); lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass); - gl->gl_lockref.count = 1; gl->gl_state = LM_ST_UNLOCKED; gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 04cadc02e5a6..0727f60ad028 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -51,7 +51,6 @@ static void gfs2_init_glock_once(void *foo) { struct gfs2_glock *gl = foo; - spin_lock_init(&gl->gl_lockref.lock); INIT_LIST_HEAD(&gl->gl_holders); INIT_LIST_HEAD(&gl->gl_lru); INIT_LIST_HEAD(&gl->gl_ail_list); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 58bc5013ca49..2298e06797ac 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -236,7 +236,7 @@ static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, str return NULL; qd->qd_sbd = sdp; - lockref_init(&qd->qd_lockref, 0); + lockref_init(&qd->qd_lockref); qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_lru); @@ -297,7 +297,6 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, spin_lock_bucket(hash); *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); if (qd == NULL) { - new_qd->qd_lockref.count++; *qdp = new_qd; list_add(&new_qd->qd_list, &sdp->sd_quota_list); hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); @@ -1450,6 +1449,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) if (qd == NULL) goto fail_brelse; + qd->qd_lockref.count = 0; set_bit(QDF_CHANGE, &qd->qd_flags); qd->qd_change = qc_change; qd->qd_slot = slot; diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index 76fa02e3835b..ef54fc8093cf 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -13,7 +13,8 @@ /* dentry case-handling: just lowercase everything */ -static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags) +static int hfs_revalidate_dentry(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; int diff; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 7e51d2cec64b..e0741e468956 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -95,32 +95,17 @@ __uml_setup("hostfs=", hostfs_args, static char *__dentry_name(struct dentry *dentry, char *name) { char *p = dentry_path_raw(dentry, name, PATH_MAX); - char *root; - size_t len; - struct hostfs_fs_info *fsi; - - fsi = dentry->d_sb->s_fs_info; - root = fsi->host_root_path; - len = strlen(root); - if (IS_ERR(p)) { - __putname(name); - return NULL; - } + struct hostfs_fs_info *fsi = dentry->d_sb->s_fs_info; + char *root = fsi->host_root_path; + size_t len = strlen(root); - /* - * This function relies on the fact that dentry_path_raw() will place - * the path name at the end of the provided buffer. - */ - BUG_ON(p + strlen(p) + 1 != name + PATH_MAX); - - strscpy(name, root, PATH_MAX); - if (len > p - name) { + if (IS_ERR(p) || len > p - name) { __putname(name); return NULL; } - if (p > name + len) - strcpy(name + len, p); + memcpy(name, root, len); + memmove(name + len, p, name + PATH_MAX - p); return name; } @@ -410,38 +395,33 @@ static const struct file_operations hostfs_dir_fops = { .fsync = hostfs_fsync, }; -static int hostfs_writepage(struct page *page, struct writeback_control *wbc) +static int hostfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; - char *buffer; - loff_t base = page_offset(page); - int count = PAGE_SIZE; - int end_index = inode->i_size >> PAGE_SHIFT; - int err; - - if (page->index >= end_index) - count = inode->i_size & (PAGE_SIZE-1); - - buffer = kmap_local_page(page); - - err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count); - if (err != count) { - if (err >= 0) - err = -EIO; - mapping_set_error(mapping, err); - goto out; + struct folio *folio = NULL; + loff_t i_size = i_size_read(inode); + int err = 0; + + while ((folio = writeback_iter(mapping, wbc, folio, &err))) { + loff_t pos = folio_pos(folio); + size_t count = folio_size(folio); + char *buffer; + int ret; + + if (count > i_size - pos) + count = i_size - pos; + + buffer = kmap_local_folio(folio, 0); + ret = write_file(HOSTFS_I(inode)->fd, &pos, buffer, count); + kunmap_local(buffer); + folio_unlock(folio); + if (ret != count) { + err = ret < 0 ? ret : -EIO; + mapping_set_error(mapping, err); + } } - if (base > inode->i_size) - inode->i_size = base; - - err = 0; - - out: - kunmap_local(buffer); - unlock_page(page); - return err; } @@ -506,11 +486,12 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping, } static const struct address_space_operations hostfs_aops = { - .writepage = hostfs_writepage, + .writepages = hostfs_writepages, .read_folio = hostfs_read_folio, .dirty_folio = filemap_dirty_folio, .write_begin = hostfs_write_begin, .write_end = hostfs_write_end, + .migrate_folio = filemap_migrate_folio, }; static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st) diff --git a/fs/inode.c b/fs/inode.c index 6b4c77268fc0..5587aabdaa5e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -184,7 +184,7 @@ static int proc_nr_inodes(const struct ctl_table *table, int write, void *buffer return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table inodes_sysctls[] = { +static const struct ctl_table inodes_sysctls[] = { { .procname = "inode-nr", .data = &inodes_stat, diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index d68a4e6ac345..fc8ede43afde 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1576,7 +1576,8 @@ out: return result; } -static int jfs_ci_revalidate(struct dentry *dentry, unsigned int flags) +static int jfs_ci_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { /* * This is not negative dentry. Always valid. diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 458519e416fe..5f0f8b95f44c 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1109,7 +1109,8 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, return ERR_PTR(rc); } -static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) +static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn; struct kernfs_root *root; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 8502ef68459b..0eb320617d7b 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -927,7 +927,7 @@ repeat: if (!inode) continue; - name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name)); + name = QSTR(kn->name); parent = kernfs_get_parent(kn); if (parent) { p_inode = ilookup(info->sb, kernfs_ino(parent)); diff --git a/fs/libfs.c b/fs/libfs.c index 5b6120b19e99..8444f5cc4064 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1782,7 +1782,7 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, { const struct dentry *parent; const struct inode *dir; - char strbuf[DNAME_INLINE_LEN]; + union shortname_store strbuf; struct qstr qstr; /* @@ -1802,22 +1802,23 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, if (!dir || !IS_CASEFOLDED(dir)) return 1; + qstr.len = len; + qstr.name = str; /* * If the dentry name is stored in-line, then it may be concurrently * modified by a rename. If this happens, the VFS will eventually retry * the lookup, so it doesn't matter what ->d_compare() returns. * However, it's unsafe to call utf8_strncasecmp() with an unstable * string. Therefore, we have to copy the name into a temporary buffer. + * As above, len is guaranteed to match str, so the shortname case + * is exactly when str points to ->d_shortname. */ - if (len <= DNAME_INLINE_LEN - 1) { - memcpy(strbuf, str, len); - strbuf[len] = 0; - str = strbuf; + if (qstr.name == dentry->d_shortname.string) { + strbuf = dentry->d_shortname; // NUL is guaranteed to be in there + qstr.name = strbuf.string; /* prevent compiler from optimizing out the temporary buffer */ barrier(); } - qstr.len = len; - qstr.name = str; return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 7ded57ec3a60..2c8eedc6c2cc 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -412,7 +412,7 @@ EXPORT_SYMBOL_GPL(lockd_down); * Sysctl parameters (same as module parameters, different interface). */ -static struct ctl_table nlm_sysctls[] = { +static const struct ctl_table nlm_sysctls[] = { { .procname = "nlm_grace_period", .data = &nlm_grace_period, diff --git a/fs/locks.c b/fs/locks.c index 25afc8d9c9d1..1619cddfa7a4 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -97,7 +97,7 @@ static int leases_enable = 1; static int lease_break_time = 45; #ifdef CONFIG_SYSCTL -static struct ctl_table locks_sysctls[] = { +static const struct ctl_table locks_sysctls[] = { { .procname = "leases-enable", .data = &leases_enable, diff --git a/fs/namei.c b/fs/namei.c index e56c29a22d26..3ab9440c5b93 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -921,10 +921,11 @@ out_dput: return false; } -static inline int d_revalidate(struct dentry *dentry, unsigned int flags) +static inline int d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) - return dentry->d_op->d_revalidate(dentry, flags); + return dentry->d_op->d_revalidate(dir, name, dentry, flags); else return 1; } @@ -1099,7 +1100,7 @@ static int sysctl_protected_fifos __read_mostly; static int sysctl_protected_regular __read_mostly; #ifdef CONFIG_SYSCTL -static struct ctl_table namei_sysctls[] = { +static const struct ctl_table namei_sysctls[] = { { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, @@ -1652,7 +1653,7 @@ static struct dentry *lookup_dcache(const struct qstr *name, { struct dentry *dentry = d_lookup(dir, name); if (dentry) { - int error = d_revalidate(dentry, flags); + int error = d_revalidate(dir->d_inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) d_invalidate(dentry); @@ -1737,19 +1738,20 @@ static struct dentry *lookup_fast(struct nameidata *nd) if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); - status = d_revalidate(dentry, nd->flags); + status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); if (likely(status > 0)) return dentry; if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ - status = d_revalidate(dentry, nd->flags); + status = d_revalidate(nd->inode, &nd->last, + dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return NULL; - status = d_revalidate(dentry, nd->flags); + status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) @@ -1777,7 +1779,7 @@ again: if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { - int error = d_revalidate(dentry, flags); + int error = d_revalidate(inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) { d_invalidate(dentry); @@ -3575,7 +3577,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, if (d_in_lookup(dentry)) break; - error = d_revalidate(dentry, nd->flags); + error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags); if (likely(error > 0)) break; if (error) diff --git a/fs/namespace.c b/fs/namespace.c index 4013fbac354a..8f1000f9f3df 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5087,30 +5087,29 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; int err; - if (sb->s_op->show_options) { - size_t start = seq->count; - - err = security_sb_show_options(seq, sb); - if (err) - return err; + err = security_sb_show_options(seq, sb); + if (err) + return err; + if (sb->s_op->show_options) { err = sb->s_op->show_options(seq, mnt->mnt_root); if (err) return err; + } - if (unlikely(seq_has_overflowed(seq))) - return -EAGAIN; + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; - if (seq->count == start) - return 0; + if (seq->count == start) + return 0; - /* skip leading comma */ - memmove(seq->buf + start, seq->buf + start + 1, - seq->count - start - 1); - seq->count--; - } + /* skip leading comma */ + memmove(seq->buf + start, seq->buf + start + 1, + seq->count - start - 1); + seq->count--; return 0; } @@ -5191,39 +5190,45 @@ static int statmount_string(struct kstatmount *s, u64 flag) size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; - u32 start = seq->count; + u32 start, *offp; + + /* Reserve an empty string at the beginning for any unset offsets */ + if (!seq->count) + seq_putc(seq, 0); + + start = seq->count; switch (flag) { case STATMOUNT_FS_TYPE: - sm->fs_type = start; + offp = &sm->fs_type; ret = statmount_fs_type(s, seq); break; case STATMOUNT_MNT_ROOT: - sm->mnt_root = start; + offp = &sm->mnt_root; ret = statmount_mnt_root(s, seq); break; case STATMOUNT_MNT_POINT: - sm->mnt_point = start; + offp = &sm->mnt_point; ret = statmount_mnt_point(s, seq); break; case STATMOUNT_MNT_OPTS: - sm->mnt_opts = start; + offp = &sm->mnt_opts; ret = statmount_mnt_opts(s, seq); break; case STATMOUNT_OPT_ARRAY: - sm->opt_array = start; + offp = &sm->opt_array; ret = statmount_opt_array(s, seq); break; case STATMOUNT_OPT_SEC_ARRAY: - sm->opt_sec_array = start; + offp = &sm->opt_sec_array; ret = statmount_opt_sec_array(s, seq); break; case STATMOUNT_FS_SUBTYPE: - sm->fs_subtype = start; + offp = &sm->fs_subtype; statmount_fs_subtype(s, seq); break; case STATMOUNT_SB_SOURCE: - sm->sb_source = start; + offp = &sm->sb_source; ret = statmount_sb_source(s, seq); break; default: @@ -5251,6 +5256,7 @@ static int statmount_string(struct kstatmount *s, u64 flag) seq->buf[seq->count++] = '\0'; sm->mask |= flag; + *offp = start; return 0; } @@ -5985,7 +5991,7 @@ const struct proc_ns_operations mntns_operations = { }; #ifdef CONFIG_SYSCTL -static struct ctl_table fs_namespace_sysctls[] = { +static const struct ctl_table fs_namespace_sysctls[] = { { .procname = "mount-max", .data = &sysctl_mount_max, diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index f761d44b3436..0d1b6d35ff3b 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -155,8 +155,9 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq, netfs_cache_read_terminated, subreq); } -static void netfs_issue_read(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) +static void netfs_queue_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq, + bool last_subreq) { struct netfs_io_stream *stream = &rreq->io_streams[0]; @@ -177,8 +178,17 @@ static void netfs_issue_read(struct netfs_io_request *rreq, } } + if (last_subreq) { + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + } + spin_unlock(&rreq->lock); +} +static void netfs_issue_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ switch (subreq->source) { case NETFS_DOWNLOAD_FROM_SERVER: rreq->netfs_ops->issue_read(subreq); @@ -293,11 +303,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) } size -= slice; start += slice; - if (size <= 0) { - smp_wmb(); /* Write lists before ALL_QUEUED. */ - set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); - } + netfs_queue_read(rreq, subreq, size <= 0); netfs_issue_read(rreq, subreq); cond_resched(); } while (size > 0); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index eb76f98c894b..1c4f953c3d68 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -135,6 +135,8 @@ extern atomic_t netfs_n_rh_write_begin; extern atomic_t netfs_n_rh_write_done; extern atomic_t netfs_n_rh_write_failed; extern atomic_t netfs_n_rh_write_zskip; +extern atomic_t netfs_n_rh_retry_read_req; +extern atomic_t netfs_n_rh_retry_read_subreq; extern atomic_t netfs_n_wh_buffered_write; extern atomic_t netfs_n_wh_writethrough; extern atomic_t netfs_n_wh_dio_write; @@ -147,6 +149,8 @@ extern atomic_t netfs_n_wh_upload_failed; extern atomic_t netfs_n_wh_write; extern atomic_t netfs_n_wh_write_done; extern atomic_t netfs_n_wh_write_failed; +extern atomic_t netfs_n_wh_retry_write_req; +extern atomic_t netfs_n_wh_retry_write_subreq; extern atomic_t netfs_n_wb_lock_skip; extern atomic_t netfs_n_wb_lock_wait; extern atomic_t netfs_n_folioq; diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index f65affa5a9e4..636cc5a98ef5 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -470,7 +470,8 @@ void netfs_read_collection_worker(struct work_struct *work) */ void netfs_wake_read_collector(struct netfs_io_request *rreq) { - if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && + !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { if (!work_pending(&rreq->work)) { netfs_get_request(rreq, netfs_rreq_trace_get_work); if (!queue_work(system_unbound_wq, &rreq->work)) @@ -586,7 +587,8 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */ /* If we are at the head of the queue, wake up the collector. */ - if (list_is_first(&subreq->rreq_link, &stream->subrequests)) + if (list_is_first(&subreq->rreq_link, &stream->subrequests) || + test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) netfs_wake_read_collector(rreq); netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated); diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 2290af0d51ac..0f294b26e08c 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -14,7 +14,7 @@ static void netfs_reissue_read(struct netfs_io_request *rreq, { __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + netfs_stat(&netfs_n_rh_retry_read_subreq); subreq->rreq->netfs_ops->issue_read(subreq); } @@ -48,6 +48,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); subreq->retry_count++; netfs_reset_iter(subreq); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_read(rreq, subreq); } } @@ -75,7 +76,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) struct iov_iter source; unsigned long long start, len; size_t part; - bool boundary = false; + bool boundary = false, subreq_superfluous = false; /* Go through the subreqs and find the next span of contiguous * buffer that we then rejig (cifs, for example, needs the @@ -116,8 +117,10 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) /* Work through the sublist. */ subreq = from; list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { - if (!len) + if (!len) { + subreq_superfluous = true; break; + } subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = start - subreq->transferred; subreq->len = len + subreq->transferred; @@ -154,19 +157,21 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_read(rreq, subreq); - if (subreq == to) + if (subreq == to) { + subreq_superfluous = false; break; + } } /* If we managed to use fewer subreqs, we can discard the * excess; if we used the same number, then we're done. */ if (!len) { - if (subreq == to) + if (!subreq_superfluous) continue; list_for_each_entry_safe_from(subreq, tmp, &stream->subrequests, rreq_link) { - trace_netfs_sreq(subreq, netfs_sreq_trace_discard); + trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous); list_del(&subreq->rreq_link); netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); if (subreq == to) @@ -187,14 +192,12 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = start; subreq->len = len; - subreq->debug_index = atomic_inc_return(&rreq->subreq_counter); subreq->stream_nr = stream->stream_nr; subreq->retry_count = 1; trace_netfs_sreq_ref(rreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), netfs_sreq_trace_new); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); list_add(&subreq->rreq_link, &to->rreq_link); to = list_next_entry(to, rreq_link); @@ -256,14 +259,34 @@ void netfs_retry_reads(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream = &rreq->io_streams[0]; + DEFINE_WAIT(myself); + + netfs_stat(&netfs_n_rh_retry_read_req); + + set_bit(NETFS_RREQ_RETRYING, &rreq->flags); /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. */ list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + continue; + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + for (;;) { + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + break; + + trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for); + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + + finish_wait(&rreq->waitq, &myself); } + clear_bit(NETFS_RREQ_RETRYING, &rreq->flags); trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); netfs_retry_read_subrequests(rreq); diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index f1af344266cc..ab6b916addc4 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -29,6 +29,8 @@ atomic_t netfs_n_rh_write_begin; atomic_t netfs_n_rh_write_done; atomic_t netfs_n_rh_write_failed; atomic_t netfs_n_rh_write_zskip; +atomic_t netfs_n_rh_retry_read_req; +atomic_t netfs_n_rh_retry_read_subreq; atomic_t netfs_n_wh_buffered_write; atomic_t netfs_n_wh_writethrough; atomic_t netfs_n_wh_dio_write; @@ -41,6 +43,8 @@ atomic_t netfs_n_wh_upload_failed; atomic_t netfs_n_wh_write; atomic_t netfs_n_wh_write_done; atomic_t netfs_n_wh_write_failed; +atomic_t netfs_n_wh_retry_write_req; +atomic_t netfs_n_wh_retry_write_subreq; atomic_t netfs_n_wb_lock_skip; atomic_t netfs_n_wb_lock_wait; atomic_t netfs_n_folioq; @@ -81,6 +85,11 @@ int netfs_stats_show(struct seq_file *m, void *v) atomic_read(&netfs_n_wh_write), atomic_read(&netfs_n_wh_write_done), atomic_read(&netfs_n_wh_write_failed)); + seq_printf(m, "Retries: rq=%u rs=%u wq=%u ws=%u\n", + atomic_read(&netfs_n_rh_retry_read_req), + atomic_read(&netfs_n_rh_retry_read_subreq), + atomic_read(&netfs_n_wh_retry_write_req), + atomic_read(&netfs_n_wh_retry_write_subreq)); seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n", atomic_read(&netfs_n_rh_rreq), atomic_read(&netfs_n_rh_sreq), diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 69727411683e..77279fc5b5a7 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -253,6 +253,7 @@ void netfs_reissue_write(struct netfs_io_stream *stream, subreq->retry_count++; __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_stat(&netfs_n_wh_retry_write_subreq); netfs_do_issue_write(stream, subreq); } diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c index c841a851dd73..545d33079a77 100644 --- a/fs/netfs/write_retry.c +++ b/fs/netfs/write_retry.c @@ -203,6 +203,8 @@ void netfs_retry_writes(struct netfs_io_request *wreq) struct netfs_io_stream *stream; int s; + netfs_stat(&netfs_n_wh_retry_write_req); + /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. */ diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 492cffd9d3d8..2b04038b0e40 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1672,7 +1672,7 @@ nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry, return nfs_lookup_revalidate_done(dir, dentry, inode, 1); } -static int nfs_lookup_revalidate_dentry(struct inode *dir, +static int nfs_lookup_revalidate_dentry(struct inode *dir, const struct qstr *name, struct dentry *dentry, struct inode *inode, unsigned int flags) { @@ -1690,7 +1690,7 @@ static int nfs_lookup_revalidate_dentry(struct inode *dir, goto out; dir_verifier = nfs_save_change_attribute(dir); - ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); + ret = NFS_PROTO(dir)->lookup(dir, dentry, name, fhandle, fattr); if (ret < 0) goto out; @@ -1732,8 +1732,8 @@ out: * cached dentry and do a new lookup. */ static int -nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, - unsigned int flags) +nfs_do_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; int error = 0; @@ -1775,7 +1775,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, if (NFS_STALE(inode)) goto out_bad; - return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags); + return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags); out_valid: return nfs_lookup_revalidate_done(dir, dentry, inode, 1); out_bad: @@ -1785,38 +1785,26 @@ out_bad: } static int -__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, - int (*reval)(struct inode *, struct dentry *, unsigned int)) +__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) { - struct dentry *parent; - struct inode *dir; - int ret; - if (flags & LOOKUP_RCU) { if (dentry->d_fsdata == NFS_FSDATA_BLOCKED) return -ECHILD; - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - ret = reval(dir, dentry, flags); - if (parent != READ_ONCE(dentry->d_parent)) - return -ECHILD; } else { /* Wait for unlink to complete - see unblock_revalidate() */ wait_var_event(&dentry->d_fsdata, smp_load_acquire(&dentry->d_fsdata) != NFS_FSDATA_BLOCKED); - parent = dget_parent(dentry); - ret = reval(d_inode(parent), dentry, flags); - dput(parent); } - return ret; + return 0; } -static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int nfs_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate); + if (__nfs_lookup_revalidate(dentry, flags)) + return -ECHILD; + return nfs_do_lookup_revalidate(dir, name, dentry, flags); } static void block_revalidate(struct dentry *dentry) @@ -1982,7 +1970,8 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in dir_verifier = nfs_save_change_attribute(dir); trace_nfs_lookup_enter(dir, dentry, flags); - error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name, + fhandle, fattr); if (error == -ENOENT) { if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) dir_verifier = inode_peek_iversion_raw(dir); @@ -2025,7 +2014,8 @@ void nfs_d_prune_case_insensitive_aliases(struct inode *inode) EXPORT_SYMBOL_GPL(nfs_d_prune_case_insensitive_aliases); #if IS_ENABLED(CONFIG_NFS_V4) -static int nfs4_lookup_revalidate(struct dentry *, unsigned int); +static int nfs4_lookup_revalidate(struct inode *, const struct qstr *, + struct dentry *, unsigned int); const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs4_lookup_revalidate, @@ -2214,11 +2204,14 @@ no_open: EXPORT_SYMBOL_GPL(nfs_atomic_open); static int -nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, - unsigned int flags) +nfs4_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; + if (__nfs_lookup_revalidate(dentry, flags)) + return -ECHILD; + trace_nfs_lookup_revalidate_enter(dir, dentry, flags); if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) @@ -2254,16 +2247,10 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, reval_dentry: if (flags & LOOKUP_RCU) return -ECHILD; - return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags); + return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags); full_reval: - return nfs_do_lookup_revalidate(dir, dentry, flags); -} - -static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) -{ - return __nfs_lookup_revalidate(dentry, flags, - nfs4_do_lookup_revalidate); + return nfs_do_lookup_revalidate(dir, name, dentry, flags); } #endif /* CONFIG_NFSV4 */ @@ -2319,7 +2306,8 @@ nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, d_drop(dentry); if (fhandle->size == 0) { - error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name, + fhandle, fattr); if (error) goto out_error; } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 2d53574da605..973aed9cc5fe 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -308,7 +308,7 @@ int nfs_submount(struct fs_context *fc, struct nfs_server *server) int err; /* Look it up again to get its attributes */ - err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry, + err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry, &dentry->d_name, ctx->mntfh, ctx->clone_data.fattr); dput(parent); if (err != 0) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 7359e1a3bd84..0c3bc98cd999 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -192,7 +192,7 @@ __nfs3_proc_lookup(struct inode *dir, const char *name, size_t len, } static int -nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, +nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { unsigned short task_flags = 0; @@ -202,8 +202,7 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, task_flags |= RPC_TASK_TIMEOUT; dprintk("NFS call lookup %pd2\n", dentry); - return __nfs3_proc_lookup(dir, dentry->d_name.name, - dentry->d_name.len, fhandle, fattr, + return __nfs3_proc_lookup(dir, name->name, name->len, fhandle, fattr, task_flags); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d615d520f8cf..df9669d4ded7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4544,15 +4544,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, - struct dentry *dentry, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct dentry *dentry, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs_server *server = NFS_SERVER(dir); int status; struct nfs4_lookup_arg args = { .bitmask = server->attr_bitmask, .dir_fh = NFS_FH(dir), - .name = &dentry->d_name, + .name = name, }; struct nfs4_lookup_res res = { .server = server, @@ -4594,17 +4594,16 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr) } static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, - struct dentry *dentry, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct dentry *dentry, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs4_exception exception = { .interruptible = true, }; struct rpc_clnt *client = *clnt; - const struct qstr *name = &dentry->d_name; int err; do { - err = _nfs4_proc_lookup(client, dir, dentry, fhandle, fattr); + err = _nfs4_proc_lookup(client, dir, dentry, name, fhandle, fattr); trace_nfs4_lookup(dir, name, err); switch (err) { case -NFS4ERR_BADNAME: @@ -4639,13 +4638,13 @@ out: return err; } -static int nfs4_proc_lookup(struct inode *dir, struct dentry *dentry, +static int nfs4_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { int status; struct rpc_clnt *client = NFS_CLIENT(dir); - status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr); + status = nfs4_proc_lookup_common(&client, dir, dentry, name, fhandle, fattr); if (client != NFS_CLIENT(dir)) { rpc_shutdown_client(client); nfs_fixup_secinfo_attributes(fattr); @@ -4660,7 +4659,8 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct dentry *dentry, struct rpc_clnt *client = NFS_CLIENT(dir); int status; - status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr); + status = nfs4_proc_lookup_common(&client, dir, dentry, &dentry->d_name, + fhandle, fattr); if (status < 0) return ERR_PTR(status); return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client; diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 886a7c4c60b3..d1a92d8f8ba4 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -17,7 +17,7 @@ static const int nfs_set_port_min; static const int nfs_set_port_max = 65535; static struct ctl_table_header *nfs4_callback_sysctl_table; -static struct ctl_table nfs4_cb_sysctls[] = { +static const struct ctl_table nfs4_cb_sysctls[] = { { .procname = "nfs_callback_tcpport", .data = &nfs_callback_set_tcpport, diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 6c09cd090c34..77920a2e3cef 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -153,13 +153,13 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs_proc_lookup(struct inode *dir, struct dentry *dentry, +nfs_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs_diropargs arg = { .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len + .name = name->name, + .len = name->len }; struct nfs_diropok res = { .fh = fhandle, diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index e645be1a3381..f579df0e8d67 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -14,7 +14,7 @@ static struct ctl_table_header *nfs_callback_sysctl_table; -static struct ctl_table nfs_cb_sysctls[] = { +static const struct ctl_table nfs_cb_sysctls[] = { { .procname = "nfs_mountpoint_timeout", .data = &nfs_mountpoint_expiry_timeout, diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 0e552d873eaa..fb9b1656a287 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -446,11 +446,20 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose) struct nfsd_file, nf_gc); struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id); struct nfsd_fcache_disposal *l = nn->fcache_disposal; + struct svc_serv *serv; spin_lock(&l->lock); list_move_tail(&nf->nf_gc, &l->freeme); spin_unlock(&l->lock); - svc_wake_up(nn->nfsd_serv); + + /* + * The filecache laundrette is shut down after the + * nn->nfsd_serv pointer is cleared, but before the + * svc_serv is freed. + */ + serv = nn->nfsd_serv; + if (serv) + svc_wake_up(serv); } } diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 4e3be7201b1c..5fb202acb0fd 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -84,6 +84,8 @@ out: fail: posix_acl_release(resp->acl_access); posix_acl_release(resp->acl_default); + resp->acl_access = NULL; + resp->acl_default = NULL; goto out; } diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 5e34e98db969..7b5433bd3019 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -76,6 +76,8 @@ out: fail: posix_acl_release(resp->acl_access); posix_acl_release(resp->acl_default); + resp->acl_access = NULL; + resp->acl_default = NULL; goto out; } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 50e468bdb8d4..484077200c5d 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -679,7 +679,7 @@ static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, return status; status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status); - if (unlikely(status || cb->cb_seq_status)) + if (unlikely(status || cb->cb_status)) return status; if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0) return -NFSERR_BAD_XDR; @@ -1583,8 +1583,11 @@ nfsd4_run_cb_work(struct work_struct *work) nfsd4_process_cb_update(cb); clnt = clp->cl_cb_client; - if (!clnt) { - /* Callback channel broken, or client killed; give up: */ + if (!clnt || clp->cl_state == NFSD4_COURTESY) { + /* + * Callback channel broken, client killed or + * nfs4_client in courtesy state; give up. + */ nfsd41_destroy_cb(cb); return; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b7a0cfd05401..153eeea2c7c9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4459,10 +4459,11 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } } while (slot && --cnt > 0); } + +out: seq->maxslots = max(session->se_target_maxslots, seq->maxslots); seq->target_maxslots = session->se_target_maxslots; -out: switch (clp->cl_cb_state) { case NFSD4_CB_DOWN: seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 32019751a41e..aef474f1b84b 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -380,8 +380,9 @@ __fh_verify(struct svc_rqst *rqstp, error = check_nfsd_access(exp, rqstp, may_bypass_gss); if (error) goto out; - - svc_xprt_set_valid(rqstp->rq_xprt); + /* During LOCALIO call to fh_verify will be called with a NULL rqstp */ + if (rqstp) + svc_xprt_set_valid(rqstp->rq_xprt); /* Finally, check access permissions. */ error = nfsd_permission(cred, exp, dentry, access); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index e8015d24a82c..6613b8fcceb0 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1186,7 +1186,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (size) { if (phys && blkphy << blkbits == phys + size) { /* The current extent goes on */ - size += n << blkbits; + size += (u64)n << blkbits; } else { /* Terminate the current extent */ ret = fiemap_fill_next_extent( @@ -1199,14 +1199,14 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags = FIEMAP_EXTENT_MERGED; logical = blkoff << blkbits; phys = blkphy << blkbits; - size = n << blkbits; + size = (u64)n << blkbits; } } else { /* Start a new extent */ flags = FIEMAP_EXTENT_MERGED; logical = blkoff << blkbits; phys = blkphy << blkbits; - size = n << blkbits; + size = (u64)n << blkbits; } blkoff += n; } diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 6004dfdfdf0f..c4cdaf5fa7ed 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -20,7 +20,7 @@ static int dir_notify_enable __read_mostly = 1; #ifdef CONFIG_SYSCTL -static struct ctl_table dnotify_sysctls[] = { +static const struct ctl_table dnotify_sysctls[] = { { .procname = "dir-notify-enable", .data = &dir_notify_enable, diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 6ff94e312232..ba3e2d09eb44 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -58,7 +58,7 @@ static int fanotify_max_queued_events __read_mostly; static long ft_zero = 0; static long ft_int_max = INT_MAX; -static struct ctl_table fanotify_table[] = { +static const struct ctl_table fanotify_table[] = { { .procname = "max_user_groups", .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 8ee495a58d0a..fae1b6d397ea 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -648,7 +648,7 @@ EXPORT_SYMBOL_GPL(fsnotify); * Later, fsnotify permission hooks do not check if there are permission event * watches, but that there were permission event watches at open time. */ -void file_set_fsnotify_mode(struct file *file) +void file_set_fsnotify_mode_from_watchers(struct file *file) { struct dentry *dentry = file->f_path.dentry, *parent; struct super_block *sb = dentry->d_sb; @@ -665,7 +665,7 @@ void file_set_fsnotify_mode(struct file *file) */ if (likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT))) { - file->f_mode |= FMODE_NONOTIFY_PERM; + file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return; } @@ -676,7 +676,7 @@ void file_set_fsnotify_mode(struct file *file) if ((!d_is_dir(dentry) && !d_is_reg(dentry)) || likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT))) { - file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; + file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); return; } @@ -686,19 +686,25 @@ void file_set_fsnotify_mode(struct file *file) */ mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask, - FSNOTIFY_PRE_CONTENT_EVENTS))) + FSNOTIFY_PRE_CONTENT_EVENTS))) { + /* Enable pre-content events */ + file_set_fsnotify_mode(file, 0); return; + } /* Is parent watching for pre-content events on this file? */ if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { parent = dget_parent(dentry); p_mask = fsnotify_inode_watches_children(d_inode(parent)); dput(parent); - if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) + if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) { + /* Enable pre-content events */ + file_set_fsnotify_mode(file, 0); return; + } } /* Nobody watching for pre-content events from this file */ - file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; + file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); } #endif diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index e0c48956608a..b372fb2c56bd 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -58,7 +58,7 @@ struct kmem_cache *inotify_inode_mark_cachep __ro_after_init; static long it_zero = 0; static long it_int_max = INT_MAX; -static struct ctl_table inotify_table[] = { +static const struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 8d789b017fa9..af94e3737470 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -787,7 +787,8 @@ pack_runs: if (err) goto out; - attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id); + attr = mi_find_attr(ni, mi, NULL, type, name, name_len, + &le->id); if (!attr) { err = -EINVAL; goto bad_inode; @@ -1181,7 +1182,7 @@ repack: goto out; } - attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, &le->id); + attr = mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, &le->id); if (!attr) { err = -EINVAL; goto out; @@ -1406,7 +1407,7 @@ int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr, */ if (!attr->non_res) { if (vbo[1] + bytes_per_off > le32_to_cpu(attr->res.data_size)) { - ntfs_inode_err(&ni->vfs_inode, "is corrupted"); + _ntfs_bad_inode(&ni->vfs_inode); return -EINVAL; } addr = resident_data(attr); @@ -1796,7 +1797,7 @@ repack: goto out; } - attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, + attr = mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, &le->id); if (!attr) { err = -EINVAL; @@ -2041,8 +2042,8 @@ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) } /* Look for required attribute. */ - attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, - 0, &le->id); + attr = mi_find_attr(ni, mi, NULL, ATTR_DATA, + NULL, 0, &le->id); if (!attr) { err = -EINVAL; goto out; @@ -2587,7 +2588,7 @@ int attr_force_nonresident(struct ntfs_inode *ni) attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, &mi); if (!attr) { - ntfs_bad_inode(&ni->vfs_inode, "no data attribute"); + _ntfs_bad_inode(&ni->vfs_inode); return -ENOENT; } diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index fc6a8aa29e3a..b6da80c69ca6 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -512,7 +512,7 @@ out: ctx->pos = pos; } else if (err < 0) { if (err == -EINVAL) - ntfs_inode_err(dir, "directory corrupted"); + _ntfs_bad_inode(dir); ctx->pos = eod; } diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 8b39d0ce5f28..5df6a0b5add9 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -75,7 +75,7 @@ struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni) { const struct ATTRIB *attr; - attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); + attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_STD, NULL, 0, NULL); return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO)) : NULL; } @@ -89,7 +89,7 @@ struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni) { const struct ATTRIB *attr; - attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); + attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_STD, NULL, 0, NULL); return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5)) : NULL; @@ -148,8 +148,10 @@ int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi) goto out; err = mi_get(ni->mi.sbi, rno, &r); - if (err) + if (err) { + _ntfs_bad_inode(&ni->vfs_inode); return err; + } ni_add_mi(ni, r); @@ -201,7 +203,8 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, *mi = &ni->mi; /* Look for required attribute in primary record. */ - return mi_find_attr(&ni->mi, attr, type, name, name_len, NULL); + return mi_find_attr(ni, &ni->mi, attr, type, name, name_len, + NULL); } /* First look for list entry of required type. */ @@ -217,7 +220,7 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, return NULL; /* Look for required attribute. */ - attr = mi_find_attr(m, NULL, type, name, name_len, &le->id); + attr = mi_find_attr(ni, m, NULL, type, name, name_len, &le->id); if (!attr) goto out; @@ -238,8 +241,7 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, return attr; out: - ntfs_inode_err(&ni->vfs_inode, "failed to parse mft record"); - ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); + _ntfs_bad_inode(&ni->vfs_inode); return NULL; } @@ -259,7 +261,7 @@ struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, if (mi) *mi = &ni->mi; /* Enum attributes in primary record. */ - return mi_enum_attr(&ni->mi, attr); + return mi_enum_attr(ni, &ni->mi, attr); } /* Get next list entry. */ @@ -275,7 +277,7 @@ struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, *mi = mi2; /* Find attribute in loaded record. */ - return rec_find_attr_le(mi2, le2); + return rec_find_attr_le(ni, mi2, le2); } /* @@ -293,7 +295,8 @@ struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, if (!ni->attr_list.size) { if (pmi) *pmi = &ni->mi; - return mi_find_attr(&ni->mi, NULL, type, name, name_len, NULL); + return mi_find_attr(ni, &ni->mi, NULL, type, name, name_len, + NULL); } le = al_find_ex(ni, NULL, type, name, name_len, NULL); @@ -319,7 +322,7 @@ struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, if (pmi) *pmi = mi; - attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id); + attr = mi_find_attr(ni, mi, NULL, type, name, name_len, &le->id); if (!attr) return NULL; @@ -330,6 +333,7 @@ struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, vcn <= le64_to_cpu(attr->nres.evcn)) return attr; + _ntfs_bad_inode(&ni->vfs_inode); return NULL; } @@ -398,7 +402,8 @@ int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, int diff; if (base_only || type == ATTR_LIST || !ni->attr_list.size) { - attr = mi_find_attr(&ni->mi, NULL, type, name, name_len, id); + attr = mi_find_attr(ni, &ni->mi, NULL, type, name, name_len, + id); if (!attr) return -ENOENT; @@ -437,7 +442,7 @@ next_le2: al_remove_le(ni, le); - attr = mi_find_attr(mi, NULL, type, name, name_len, id); + attr = mi_find_attr(ni, mi, NULL, type, name, name_len, id); if (!attr) return -ENOENT; @@ -485,7 +490,7 @@ ni_ins_new_attr(struct ntfs_inode *ni, struct mft_inode *mi, name = le->name; } - attr = mi_insert_attr(mi, type, name, name_len, asize, name_off); + attr = mi_insert_attr(ni, mi, type, name, name_len, asize, name_off); if (!attr) { if (le_added) al_remove_le(ni, le); @@ -673,7 +678,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) if (err) return err; - attr_list = mi_find_attr(&ni->mi, NULL, ATTR_LIST, NULL, 0, NULL); + attr_list = mi_find_attr(ni, &ni->mi, NULL, ATTR_LIST, NULL, 0, NULL); if (!attr_list) return 0; @@ -695,7 +700,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) if (!mi) return 0; - attr = mi_find_attr(mi, NULL, le->type, le_name(le), + attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le), le->name_len, &le->id); if (!attr) return 0; @@ -731,7 +736,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) goto out; } - attr = mi_find_attr(mi, NULL, le->type, le_name(le), + attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le), le->name_len, &le->id); if (!attr) { /* Should never happened, 'cause already checked. */ @@ -740,7 +745,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) asize = le32_to_cpu(attr->size); /* Insert into primary record. */ - attr_ins = mi_insert_attr(&ni->mi, le->type, le_name(le), + attr_ins = mi_insert_attr(ni, &ni->mi, le->type, le_name(le), le->name_len, asize, le16_to_cpu(attr->name_off)); if (!attr_ins) { @@ -768,7 +773,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) if (!mi) continue; - attr = mi_find_attr(mi, NULL, le->type, le_name(le), + attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le), le->name_len, &le->id); if (!attr) continue; @@ -831,7 +836,7 @@ int ni_create_attr_list(struct ntfs_inode *ni) free_b = 0; attr = NULL; - for (; (attr = mi_enum_attr(&ni->mi, attr)); le = Add2Ptr(le, sz)) { + for (; (attr = mi_enum_attr(ni, &ni->mi, attr)); le = Add2Ptr(le, sz)) { sz = le_size(attr->name_len); le->type = attr->type; le->size = cpu_to_le16(sz); @@ -886,7 +891,7 @@ int ni_create_attr_list(struct ntfs_inode *ni) u32 asize = le32_to_cpu(b->size); u16 name_off = le16_to_cpu(b->name_off); - attr = mi_insert_attr(mi, b->type, Add2Ptr(b, name_off), + attr = mi_insert_attr(ni, mi, b->type, Add2Ptr(b, name_off), b->name_len, asize, name_off); if (!attr) goto out; @@ -909,7 +914,7 @@ int ni_create_attr_list(struct ntfs_inode *ni) goto out; } - attr = mi_insert_attr(&ni->mi, ATTR_LIST, NULL, 0, + attr = mi_insert_attr(ni, &ni->mi, ATTR_LIST, NULL, 0, lsize + SIZEOF_RESIDENT, SIZEOF_RESIDENT); if (!attr) goto out; @@ -993,13 +998,13 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, mi = rb_entry(node, struct mft_inode, node); if (is_mft_data && - (mi_enum_attr(mi, NULL) || + (mi_enum_attr(ni, mi, NULL) || vbo <= ((u64)mi->rno << sbi->record_bits))) { /* We can't accept this record 'cause MFT's bootstrapping. */ continue; } if (is_mft && - mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, NULL)) { + mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, NULL)) { /* * This child record already has a ATTR_DATA. * So it can't accept any other records. @@ -1008,7 +1013,7 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, } if ((type != ATTR_NAME || name_len) && - mi_find_attr(mi, NULL, type, name, name_len, NULL)) { + mi_find_attr(ni, mi, NULL, type, name, name_len, NULL)) { /* Only indexed attributes can share same record. */ continue; } @@ -1157,7 +1162,7 @@ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, /* Estimate the result of moving all possible attributes away. */ attr = NULL; - while ((attr = mi_enum_attr(&ni->mi, attr))) { + while ((attr = mi_enum_attr(ni, &ni->mi, attr))) { if (attr->type == ATTR_STD) continue; if (attr->type == ATTR_LIST) @@ -1175,7 +1180,7 @@ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, attr = NULL; for (;;) { - attr = mi_enum_attr(&ni->mi, attr); + attr = mi_enum_attr(ni, &ni->mi, attr); if (!attr) { /* We should never be here 'cause we have already check this case. */ err = -EINVAL; @@ -1259,7 +1264,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni) for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) { mi = rb_entry(node, struct mft_inode, node); - attr = mi_enum_attr(mi, NULL); + attr = mi_enum_attr(ni, mi, NULL); if (!attr) { mft_min = mi->rno; @@ -1280,7 +1285,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni) ni_remove_mi(ni, mi_new); } - attr = mi_find_attr(&ni->mi, NULL, ATTR_DATA, NULL, 0, NULL); + attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_DATA, NULL, 0, NULL); if (!attr) { err = -EINVAL; goto out; @@ -1397,7 +1402,7 @@ int ni_expand_list(struct ntfs_inode *ni) continue; /* Find attribute in primary record. */ - attr = rec_find_attr_le(&ni->mi, le); + attr = rec_find_attr_le(ni, &ni->mi, le); if (!attr) { err = -EINVAL; goto out; @@ -1604,8 +1609,8 @@ int ni_delete_all(struct ntfs_inode *ni) roff = le16_to_cpu(attr->nres.run_off); if (roff > asize) { - _ntfs_bad_inode(&ni->vfs_inode); - return -EINVAL; + /* ni_enum_attr_ex checks this case. */ + continue; } /* run==1 means unpack and deallocate. */ @@ -2726,9 +2731,10 @@ int ni_write_frame(struct ntfs_inode *ni, struct page **pages, { int err; struct ntfs_sb_info *sbi = ni->mi.sbi; + struct folio *folio = page_folio(pages[0]); u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits; u32 frame_size = sbi->cluster_size << NTFS_LZNT_CUNIT; - u64 frame_vbo = (u64)pages[0]->index << PAGE_SHIFT; + u64 frame_vbo = folio_pos(folio); CLST frame = frame_vbo >> frame_bits; char *frame_ondisk = NULL; struct page **pages_disk = NULL; @@ -3343,7 +3349,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) if (!mi->dirty) continue; - is_empty = !mi_enum_attr(mi, NULL); + is_empty = !mi_enum_attr(ni, mi, NULL); if (is_empty) clear_rec_inuse(mi->mrec); diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 03471bc9371c..938d351ebac7 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -908,7 +908,11 @@ void ntfs_bad_inode(struct inode *inode, const char *hint) ntfs_inode_err(inode, "%s", hint); make_bad_inode(inode); - ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + /* Avoid recursion if bad inode is $Volume. */ + if (inode->i_ino != MFT_REC_VOL && + !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING)) { + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + } } /* diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 9089c58a005c..7eb9fae22f8d 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -1094,8 +1094,7 @@ int indx_read(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn, ok: if (!index_buf_check(ib, bytes, &vbn)) { - ntfs_inode_err(&ni->vfs_inode, "directory corrupted"); - ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); + _ntfs_bad_inode(&ni->vfs_inode); err = -EINVAL; goto out; } @@ -1117,8 +1116,7 @@ ok: out: if (err == -E_NTFS_CORRUPT) { - ntfs_inode_err(&ni->vfs_inode, "directory corrupted"); - ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); + _ntfs_bad_inode(&ni->vfs_inode); err = -EINVAL; } diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index be04d2845bb7..a1e11228dafd 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -410,6 +410,9 @@ end_enum: if (!std5) goto out; + if (is_bad_inode(inode)) + goto out; + if (!is_match && name) { err = -ENOENT; goto out; diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index cd8e8374bb5a..382820464dee 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -745,23 +745,24 @@ int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi); void mi_put(struct mft_inode *mi); int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno); int mi_read(struct mft_inode *mi, bool is_mft); -struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr); -// TODO: id? -struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr, - enum ATTR_TYPE type, const __le16 *name, - u8 name_len, const __le16 *id); -static inline struct ATTRIB *rec_find_attr_le(struct mft_inode *rec, +struct ATTRIB *mi_enum_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr); +struct ATTRIB *mi_find_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, const __le16 *id); +static inline struct ATTRIB *rec_find_attr_le(struct ntfs_inode *ni, + struct mft_inode *rec, struct ATTR_LIST_ENTRY *le) { - return mi_find_attr(rec, NULL, le->type, le_name(le), le->name_len, + return mi_find_attr(ni, rec, NULL, le->type, le_name(le), le->name_len, &le->id); } int mi_write(struct mft_inode *mi, int wait); int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, __le16 flags, bool is_mft); -struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, - const __le16 *name, u8 name_len, u32 asize, - u16 name_off); +struct ATTRIB *mi_insert_attr(struct ntfs_inode *ni, struct mft_inode *mi, + enum ATTR_TYPE type, const __le16 *name, + u8 name_len, u32 asize, u16 name_off); bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi, struct ATTRIB *attr); diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 61d53d39f3b9..714c7ecedca8 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -31,7 +31,7 @@ static inline int compare_attr(const struct ATTRIB *left, enum ATTR_TYPE type, * * Return: Unused attribute id that is less than mrec->next_attr_id. */ -static __le16 mi_new_attt_id(struct mft_inode *mi) +static __le16 mi_new_attt_id(struct ntfs_inode *ni, struct mft_inode *mi) { u16 free_id, max_id, t16; struct MFT_REC *rec = mi->mrec; @@ -52,7 +52,7 @@ static __le16 mi_new_attt_id(struct mft_inode *mi) attr = NULL; for (;;) { - attr = mi_enum_attr(mi, attr); + attr = mi_enum_attr(ni, mi, attr); if (!attr) { rec->next_attr_id = cpu_to_le16(max_id + 1); mi->dirty = true; @@ -195,7 +195,8 @@ out: * NOTE: mi->mrec - memory of size sbi->record_size * here we sure that mi->mrec->total == sbi->record_size (see mi_read) */ -struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) +struct ATTRIB *mi_enum_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr) { const struct MFT_REC *rec = mi->mrec; u32 used = le32_to_cpu(rec->used); @@ -209,11 +210,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) off = le16_to_cpu(rec->attr_off); if (used > total) - return NULL; + goto out; if (off >= used || off < MFTRECORD_FIXUP_OFFSET_1 || !IS_ALIGNED(off, 8)) { - return NULL; + goto out; } /* Skip non-resident records. */ @@ -243,7 +244,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) */ if (off + 8 > used) { static_assert(ALIGN(sizeof(enum ATTR_TYPE), 8) == 8); - return NULL; + goto out; } if (attr->type == ATTR_END) { @@ -254,112 +255,116 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) /* 0x100 is last known attribute for now. */ t32 = le32_to_cpu(attr->type); if (!t32 || (t32 & 0xf) || (t32 > 0x100)) - return NULL; + goto out; /* attributes in record must be ordered by type */ if (t32 < prev_type) - return NULL; + goto out; asize = le32_to_cpu(attr->size); if (!IS_ALIGNED(asize, 8)) - return NULL; + goto out; /* Check overflow and boundary. */ if (off + asize < off || off + asize > used) - return NULL; + goto out; /* Can we use the field attr->non_res. */ if (off + 9 > used) - return NULL; + goto out; /* Check size of attribute. */ if (!attr->non_res) { /* Check resident fields. */ if (asize < SIZEOF_RESIDENT) - return NULL; + goto out; t16 = le16_to_cpu(attr->res.data_off); if (t16 > asize) - return NULL; + goto out; if (le32_to_cpu(attr->res.data_size) > asize - t16) - return NULL; + goto out; t32 = sizeof(short) * attr->name_len; if (t32 && le16_to_cpu(attr->name_off) + t32 > t16) - return NULL; + goto out; return attr; } /* Check nonresident fields. */ if (attr->non_res != 1) - return NULL; + goto out; /* Can we use memory including attr->nres.valid_size? */ if (asize < SIZEOF_NONRESIDENT) - return NULL; + goto out; t16 = le16_to_cpu(attr->nres.run_off); if (t16 > asize) - return NULL; + goto out; t32 = sizeof(short) * attr->name_len; if (t32 && le16_to_cpu(attr->name_off) + t32 > t16) - return NULL; + goto out; /* Check start/end vcn. */ if (le64_to_cpu(attr->nres.svcn) > le64_to_cpu(attr->nres.evcn) + 1) - return NULL; + goto out; data_size = le64_to_cpu(attr->nres.data_size); if (le64_to_cpu(attr->nres.valid_size) > data_size) - return NULL; + goto out; alloc_size = le64_to_cpu(attr->nres.alloc_size); if (data_size > alloc_size) - return NULL; + goto out; t32 = mi->sbi->cluster_mask; if (alloc_size & t32) - return NULL; + goto out; if (!attr->nres.svcn && is_attr_ext(attr)) { /* First segment of sparse/compressed attribute */ /* Can we use memory including attr->nres.total_size? */ if (asize < SIZEOF_NONRESIDENT_EX) - return NULL; + goto out; tot_size = le64_to_cpu(attr->nres.total_size); if (tot_size & t32) - return NULL; + goto out; if (tot_size > alloc_size) - return NULL; + goto out; } else { if (attr->nres.c_unit) - return NULL; + goto out; if (alloc_size > mi->sbi->volume.size) - return NULL; + goto out; } return attr; + +out: + _ntfs_bad_inode(&ni->vfs_inode); + return NULL; } /* * mi_find_attr - Find the attribute by type and name and id. */ -struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr, - enum ATTR_TYPE type, const __le16 *name, - u8 name_len, const __le16 *id) +struct ATTRIB *mi_find_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, const __le16 *id) { u32 type_in = le32_to_cpu(type); u32 atype; next_attr: - attr = mi_enum_attr(mi, attr); + attr = mi_enum_attr(ni, mi, attr); if (!attr) return NULL; @@ -467,9 +472,9 @@ int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, * * Return: Not full constructed attribute or NULL if not possible to create. */ -struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, - const __le16 *name, u8 name_len, u32 asize, - u16 name_off) +struct ATTRIB *mi_insert_attr(struct ntfs_inode *ni, struct mft_inode *mi, + enum ATTR_TYPE type, const __le16 *name, + u8 name_len, u32 asize, u16 name_off) { size_t tail; struct ATTRIB *attr; @@ -488,7 +493,7 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, * at which we should insert it. */ attr = NULL; - while ((attr = mi_enum_attr(mi, attr))) { + while ((attr = mi_enum_attr(ni, mi, attr))) { int diff = compare_attr(attr, type, name, name_len, upcase); if (diff < 0) @@ -508,7 +513,7 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, tail = used - PtrOffset(rec, attr); } - id = mi_new_attt_id(mi); + id = mi_new_attt_id(ni, mi); memmove(Add2Ptr(attr, asize), attr, tail); memset(attr, 0, asize); diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index a9b8688aaf30..1873bbbb7e5b 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -32,7 +32,8 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry) } -static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) +static int ocfs2_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; int ret = 0; /* if all else fails, just return false */ @@ -44,8 +45,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) inode = d_inode(dentry); osb = OCFS2_SB(dentry->d_sb); - trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len, - dentry->d_name.name); + trace_ocfs2_dentry_revalidate(dentry, name->len, name->name); /* For a negative dentry - * check the generation number of the parent and compare with the @@ -53,12 +53,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) */ if (inode == NULL) { unsigned long gen = (unsigned long) dentry->d_fsdata; - unsigned long pgen; - spin_lock(&dentry->d_lock); - pgen = OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen; - spin_unlock(&dentry->d_lock); - trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len, - dentry->d_name.name, + unsigned long pgen = OCFS2_I(dir)->ip_dir_lock_gen; + trace_ocfs2_dentry_revalidate_negative(name->len, name->name, pgen, gen); if (gen != pgen) goto bail; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 20aa37b67cfb..ddd761cf44c8 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -650,7 +650,7 @@ error: * and easier to preserve the name. */ -static struct ctl_table ocfs2_nm_table[] = { +static const struct ctl_table ocfs2_nm_table[] = { { .procname = "hb_ctl_path", .data = ocfs2_hb_ctl_path, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index e0b91dbaa0ac..8bb5022f3082 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2285,7 +2285,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, mlog(ML_ERROR, "found superblock with incorrect block " "size bits: found %u, should be 9, 10, 11, or 12\n", blksz_bits); - } else if ((1 << le32_to_cpu(blksz_bits)) != blksz) { + } else if ((1 << blksz_bits) != blksz) { mlog(ML_ERROR, "found superblock with incorrect block " "size: found %u, should be %u\n", 1 << blksz_bits, blksz); } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != diff --git a/fs/open.c b/fs/open.c index 932e5a6de63b..1be20de9f283 100644 --- a/fs/open.c +++ b/fs/open.c @@ -905,7 +905,8 @@ static int do_dentry_open(struct file *f, f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { - f->f_mode = FMODE_PATH | FMODE_OPENED | FMODE_NONOTIFY; + f->f_mode = FMODE_PATH | FMODE_OPENED; + file_set_fsnotify_mode(f, FMODE_NONOTIFY); f->f_op = &empty_fops; return 0; } @@ -935,10 +936,10 @@ static int do_dentry_open(struct file *f, /* * Set FMODE_NONOTIFY_* bits according to existing permission watches. - * If FMODE_NONOTIFY was already set for an fanotify fd, this doesn't - * change anything. + * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a + * pseudo file, this call will not change the mode. */ - file_set_fsnotify_mode(f); + file_set_fsnotify_mode_from_watchers(f); error = fsnotify_open_perm(f); if (error) goto cleanup_all; @@ -1122,7 +1123,7 @@ struct file *dentry_open_nonotify(const struct path *path, int flags, if (!IS_ERR(f)) { int error; - f->f_mode |= FMODE_NONOTIFY; + file_set_fsnotify_mode(f, FMODE_NONOTIFY); error = vfs_open(path, f); if (error) { fput(f); diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c index 395a00ed8ac7..a19d1ad705db 100644 --- a/fs/orangefs/dcache.c +++ b/fs/orangefs/dcache.c @@ -13,10 +13,9 @@ #include "orangefs-kernel.h" /* Returns 1 if dentry can still be trusted, else 0. */ -static int orangefs_revalidate_lookup(struct dentry *dentry) +static int orangefs_revalidate_lookup(struct inode *parent_inode, const struct qstr *name, + struct dentry *dentry) { - struct dentry *parent_dentry = dget_parent(dentry); - struct inode *parent_inode = parent_dentry->d_inode; struct orangefs_inode_s *parent = ORANGEFS_I(parent_inode); struct inode *inode = dentry->d_inode; struct orangefs_kernel_op_s *new_op; @@ -26,14 +25,14 @@ static int orangefs_revalidate_lookup(struct dentry *dentry) gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__); new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP); - if (!new_op) { - ret = -ENOMEM; - goto out_put_parent; - } + if (!new_op) + return -ENOMEM; new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW; new_op->upcall.req.lookup.parent_refn = parent->refn; - strscpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name); + /* op_alloc() leaves ->upcall zeroed */ + memcpy(new_op->upcall.req.lookup.d_name, name->name, + min(name->len, ORANGEFS_NAME_MAX - 1)); gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d interrupt flag [%d]\n", @@ -78,8 +77,6 @@ static int orangefs_revalidate_lookup(struct dentry *dentry) ret = 1; out_release_op: op_release(new_op); -out_put_parent: - dput(parent_dentry); return ret; out_drop: gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d revalidate failed\n", @@ -92,7 +89,8 @@ out_drop: * * Should return 1 if dentry can still be trusted, else 0. */ -static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags) +static int orangefs_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { int ret; unsigned long time = (unsigned long) dentry->d_fsdata; @@ -114,7 +112,7 @@ static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags) * If this passes, the positive dentry still exists or the negative * dentry still does not exist. */ - if (!orangefs_revalidate_lookup(dentry)) + if (!orangefs_revalidate_lookup(dir, name, dentry)) return 0; /* We do not need to continue with negative dentries. */ diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 9729f071c5aa..f52073022fae 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -392,9 +392,9 @@ static ssize_t orangefs_debug_write(struct file *file, * Thwart users who try to jamb a ridiculous number * of bytes into the debug file... */ - if (count > ORANGEFS_MAX_DEBUG_STRING_LEN + 1) { + if (count > ORANGEFS_MAX_DEBUG_STRING_LEN) { silly = count; - count = ORANGEFS_MAX_DEBUG_STRING_LEN + 1; + count = ORANGEFS_MAX_DEBUG_STRING_LEN; } buf = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index cea820cb3b55..be5c65d6f848 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -14,8 +14,6 @@ #include <linux/exportfs.h> #include "overlayfs.h" -#include "../internal.h" /* for vfs_path_lookup */ - struct ovl_lookup_data { struct super_block *sb; const struct ovl_layer *layer; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index fe511192f83c..86ae6f6da36b 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -91,7 +91,24 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak) if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) ret = d->d_op->d_weak_revalidate(d, flags); } else if (d->d_flags & DCACHE_OP_REVALIDATE) { - ret = d->d_op->d_revalidate(d, flags); + struct dentry *parent; + struct inode *dir; + struct name_snapshot n; + + if (flags & LOOKUP_RCU) { + parent = READ_ONCE(d->d_parent); + dir = d_inode_rcu(parent); + if (!dir) + return -ECHILD; + } else { + parent = dget_parent(d); + dir = d_inode(parent); + } + take_dentry_name_snapshot(&n, d); + ret = d->d_op->d_revalidate(dir, &n.name, d, flags); + release_dentry_name_snapshot(&n); + if (!(flags & LOOKUP_RCU)) + dput(parent); if (!ret) { if (!(flags & LOOKUP_RCU)) d_invalidate(d); @@ -127,7 +144,8 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry, return ret; } -static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) +static int ovl_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { return ovl_dentry_revalidate_common(dentry, flags, false); } diff --git a/fs/pidfs.c b/fs/pidfs.c index 049352f973de..63f9699ebac3 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -287,7 +287,6 @@ static bool pidfs_ioctl_valid(unsigned int cmd) switch (cmd) { case FS_IOC_GETVERSION: case PIDFD_GET_CGROUP_NAMESPACE: - case PIDFD_GET_INFO: case PIDFD_GET_IPC_NAMESPACE: case PIDFD_GET_MNT_NAMESPACE: case PIDFD_GET_NET_NAMESPACE: @@ -300,6 +299,17 @@ static bool pidfs_ioctl_valid(unsigned int cmd) return true; } + /* Extensible ioctls require some more careful checks. */ + switch (_IOC_NR(cmd)) { + case _IOC_NR(PIDFD_GET_INFO): + /* + * Try to prevent performing a pidfd ioctl when someone + * erronously mistook the file descriptor for a pidfd. + * This is not perfect but will catch most cases. + */ + return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO)); + } + return false; } diff --git a/fs/pipe.c b/fs/pipe.c index 82fede0f2111..ce1af7592780 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -960,6 +960,12 @@ int create_pipe_files(struct file **res, int flags) res[1] = f; stream_open(inode, res[0]); stream_open(inode, res[1]); + /* + * Disable permission and pre-content events, but enable legacy + * inotify events for legacy users. + */ + file_set_fsnotify_mode(res[0], FMODE_NONOTIFY_PERM); + file_set_fsnotify_mode(res[1], FMODE_NONOTIFY_PERM); return 0; } @@ -1478,7 +1484,7 @@ static int proc_dopipe_max_size(const struct ctl_table *table, int write, do_proc_dopipe_max_size_conv, NULL); } -static struct ctl_table fs_pipe_sysctls[] = { +static const struct ctl_table fs_pipe_sysctls[] = { { .procname = "pipe-max-size", .data = &pipe_max_size, diff --git a/fs/proc/base.c b/fs/proc/base.c index a50b222a5917..cd89e956c322 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2058,7 +2058,8 @@ void pid_update_inode(struct task_struct *task, struct inode *inode) * performed a setuid(), etc. * */ -static int pid_revalidate(struct dentry *dentry, unsigned int flags) +static int pid_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; @@ -2191,7 +2192,8 @@ static int dname_to_vma_addr(struct dentry *dentry, return 0; } -static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) +static int map_files_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; bool exact_vma_exists = false; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 24baf23e864f..37aa778d1af7 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -140,7 +140,8 @@ static void tid_fd_update_inode(struct task_struct *task, struct inode *inode, security_task_to_inode(task, inode); } -static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) +static int tid_fd_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct task_struct *task; struct inode *inode; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index dbe82cf23ee4..8ec90826a49e 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -216,7 +216,8 @@ void proc_free_inum(unsigned int inum) ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); } -static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags) +static int proc_misc_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; @@ -343,7 +344,8 @@ static const struct file_operations proc_dir_operations = { .iterate_shared = proc_readdir, }; -static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) +static int proc_net_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { return 0; } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 27a283d85a6e..cc9d74a06ff0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -884,7 +884,8 @@ static const struct inode_operations proc_sys_dir_operations = { .getattr = proc_sys_getattr, }; -static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) +static int proc_sys_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a00120a3c099..10d01eb09c43 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1524,7 +1524,7 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) pr_warn_once("Unexpected adding of device dump\n"); if (vmcore_open) { ret = -EBUSY; - goto out_err; + goto unlock; } list_add_tail(&dump->list, &vmcoredd_list); @@ -1532,6 +1532,9 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) mutex_unlock(&vmcore_mutex); return 0; +unlock: + mutex_unlock(&vmcore_mutex); + out_err: vfree(buf); vfree(dump); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index f9578918cfb2..825c5c2e0962 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2926,7 +2926,7 @@ static int do_proc_dqstats(const struct ctl_table *table, int write, return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table fs_dqstats_table[] = { +static const struct ctl_table fs_dqstats_table[] = { { .procname = "lookups", .data = &dqstats.stat[DQST_LOOKUPS], diff --git a/fs/smb/client/asn1.c b/fs/smb/client/asn1.c index b5724ef9f182..214a44509e7b 100644 --- a/fs/smb/client/asn1.c +++ b/fs/smb/client/asn1.c @@ -52,6 +52,8 @@ int cifs_neg_token_init_mech_type(void *context, size_t hdrlen, server->sec_kerberos = true; else if (oid == OID_ntlmssp) server->sec_ntlmssp = true; + else if (oid == OID_IAKerb) + server->sec_iakerb = true; else { char buf[50]; diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index 28f568b5fc27..bc1c1e9b288a 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -138,11 +138,13 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, dp = description + strlen(description); - /* for now, only sec=krb5 and sec=mskrb5 are valid */ + /* for now, only sec=krb5 and sec=mskrb5 and iakerb are valid */ if (server->sec_kerberos) sprintf(dp, ";sec=krb5"); else if (server->sec_mskerberos) sprintf(dp, ";sec=mskrb5"); + else if (server->sec_iakerb) + sprintf(dp, ";sec=iakerb"); else { cifs_dbg(VFS, "unknown or missing server auth type, use krb5\n"); sprintf(dp, ";sec=krb5"); diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index ba79aa2107cc..699a3f76d083 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -1395,7 +1395,7 @@ chown_chgrp_exit: #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, const struct cifs_fid *cifsfid, u32 *pacllen, - u32 __maybe_unused unused) + u32 info) { struct smb_ntsd *pntsd = NULL; unsigned int xid; @@ -1407,7 +1407,7 @@ struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, xid = get_xid(); rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), cifsfid->netfid, &pntsd, - pacllen); + pacllen, info); free_xid(xid); cifs_put_tlink(tlink); @@ -1419,7 +1419,7 @@ struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, } static struct smb_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, - const char *path, u32 *pacllen) + const char *path, u32 *pacllen, u32 info) { struct smb_ntsd *pntsd = NULL; int oplock = 0; @@ -1446,9 +1446,12 @@ static struct smb_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, .fid = &fid, }; + if (info & SACL_SECINFO) + oparms.desired_access |= SYSTEM_SECURITY; + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (!rc) { - rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen); + rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen, info); CIFSSMBClose(xid, tcon, fid.netfid); } @@ -1472,7 +1475,7 @@ struct smb_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, if (inode) open_file = find_readable_file(CIFS_I(inode), true); if (!open_file) - return get_cifs_acl_by_path(cifs_sb, path, pacllen); + return get_cifs_acl_by_path(cifs_sb, path, pacllen, info); pntsd = get_cifs_acl_by_fid(cifs_sb, &open_file->fid, pacllen, info); cifsFileInfo_put(open_file); @@ -1485,7 +1488,7 @@ int set_cifs_acl(struct smb_ntsd *pnntsd, __u32 acllen, { int oplock = 0; unsigned int xid; - int rc, access_flags; + int rc, access_flags = 0; struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); @@ -1498,10 +1501,12 @@ int set_cifs_acl(struct smb_ntsd *pnntsd, __u32 acllen, tcon = tlink_tcon(tlink); xid = get_xid(); - if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP) - access_flags = WRITE_OWNER; - else - access_flags = WRITE_DAC; + if (aclflag & CIFS_ACL_OWNER || aclflag & CIFS_ACL_GROUP) + access_flags |= WRITE_OWNER; + if (aclflag & CIFS_ACL_SACL) + access_flags |= SYSTEM_SECURITY; + if (aclflag & CIFS_ACL_DACL) + access_flags |= WRITE_DAC; oparms = (struct cifs_open_parms) { .tcon = tcon, diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index b800c9f585d8..6a3bd652d251 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -715,6 +715,12 @@ cifs_show_options(struct seq_file *s, struct dentry *root) cifs_sb->ctx->backupgid)); seq_show_option(s, "reparse", cifs_reparse_type_str(cifs_sb->ctx->reparse_type)); + if (cifs_sb->ctx->nonativesocket) + seq_puts(s, ",nonativesocket"); + else + seq_puts(s, ",nativesocket"); + seq_show_option(s, "symlink", + cifs_symlink_type_str(get_cifs_symlink_type(cifs_sb))); seq_printf(s, ",rsize=%u", cifs_sb->ctx->rsize); seq_printf(s, ",wsize=%u", cifs_sb->ctx->wsize); diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index a762dbbbd959..831fee962c4d 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -146,6 +146,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 52 -#define CIFS_VERSION "2.52" +#define SMB3_PRODUCT_BUILD 53 +#define CIFS_VERSION "2.53" #endif /* _CIFSFS_H */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 49ffc040f736..cddeb2adbf4a 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -151,6 +151,7 @@ enum securityEnum { NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ Kerberos, /* Kerberos via SPNEGO */ + IAKerb, /* Kerberos proxy */ }; enum upcall_target_enum { @@ -160,6 +161,7 @@ enum upcall_target_enum { }; enum cifs_reparse_type { + CIFS_REPARSE_TYPE_NONE, CIFS_REPARSE_TYPE_NFS, CIFS_REPARSE_TYPE_WSL, CIFS_REPARSE_TYPE_DEFAULT = CIFS_REPARSE_TYPE_NFS, @@ -168,6 +170,8 @@ enum cifs_reparse_type { static inline const char *cifs_reparse_type_str(enum cifs_reparse_type type) { switch (type) { + case CIFS_REPARSE_TYPE_NONE: + return "none"; case CIFS_REPARSE_TYPE_NFS: return "nfs"; case CIFS_REPARSE_TYPE_WSL: @@ -177,6 +181,39 @@ static inline const char *cifs_reparse_type_str(enum cifs_reparse_type type) } } +enum cifs_symlink_type { + CIFS_SYMLINK_TYPE_DEFAULT, + CIFS_SYMLINK_TYPE_NONE, + CIFS_SYMLINK_TYPE_NATIVE, + CIFS_SYMLINK_TYPE_UNIX, + CIFS_SYMLINK_TYPE_MFSYMLINKS, + CIFS_SYMLINK_TYPE_SFU, + CIFS_SYMLINK_TYPE_NFS, + CIFS_SYMLINK_TYPE_WSL, +}; + +static inline const char *cifs_symlink_type_str(enum cifs_symlink_type type) +{ + switch (type) { + case CIFS_SYMLINK_TYPE_NONE: + return "none"; + case CIFS_SYMLINK_TYPE_NATIVE: + return "native"; + case CIFS_SYMLINK_TYPE_UNIX: + return "unix"; + case CIFS_SYMLINK_TYPE_MFSYMLINKS: + return "mfsymlinks"; + case CIFS_SYMLINK_TYPE_SFU: + return "sfu"; + case CIFS_SYMLINK_TYPE_NFS: + return "nfs"; + case CIFS_SYMLINK_TYPE_WSL: + return "wsl"; + default: + return "unknown"; + } +} + struct session_key { unsigned int len; char *response; @@ -215,10 +252,8 @@ struct cifs_cred { struct cifs_open_info_data { bool adjust_tz; - union { - bool reparse_point; - bool symlink; - }; + bool reparse_point; + bool contains_posix_file_info; struct { /* ioctl response buffer */ struct { @@ -226,10 +261,7 @@ struct cifs_open_info_data { struct kvec iov; } io; __u32 tag; - union { - struct reparse_data_buffer *buf; - struct reparse_posix_data *posix; - }; + struct reparse_data_buffer *buf; } reparse; struct { __u8 eas[SMB2_WSL_MAX_QUERY_EA_RESP_SIZE]; @@ -326,7 +358,7 @@ struct smb_version_operations { int (*handle_cancelled_mid)(struct mid_q_entry *, struct TCP_Server_Info *); void (*downgrade_oplock)(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache); + __u16 epoch, bool *purge_cache); /* process transaction2 response */ bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *, char *, int); @@ -521,12 +553,12 @@ struct smb_version_operations { /* if we can do cache read operations */ bool (*is_read_op)(__u32); /* set oplock level for the inode */ - void (*set_oplock_level)(struct cifsInodeInfo *, __u32, unsigned int, - bool *); + void (*set_oplock_level)(struct cifsInodeInfo *cinode, __u32 oplock, __u16 epoch, + bool *purge_cache); /* create lease context buffer for CREATE request */ char * (*create_lease_buf)(u8 *lease_key, u8 oplock); /* parse lease context buffer and return oplock/epoch info */ - __u8 (*parse_lease_buf)(void *buf, unsigned int *epoch, char *lkey); + __u8 (*parse_lease_buf)(void *buf, __u16 *epoch, char *lkey); ssize_t (*copychunk_range)(const unsigned int, struct cifsFileInfo *src_file, struct cifsFileInfo *target_file, @@ -751,6 +783,7 @@ struct TCP_Server_Info { bool sec_kerberosu2u; /* supports U2U Kerberos */ bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ + bool sec_iakerb; /* supports pass-through auth for Kerberos (krb5 proxy) */ bool large_buf; /* is current buffer large? */ /* use SMBD connection instead of socket */ bool rdma; @@ -1415,7 +1448,7 @@ struct cifs_fid { __u8 create_guid[16]; __u32 access; struct cifs_pending_open *pending_open; - unsigned int epoch; + __u16 epoch; #ifdef CONFIG_CIFS_DEBUG2 __u64 mid; #endif /* CIFS_DEBUG2 */ @@ -1448,7 +1481,7 @@ struct cifsFileInfo { bool oplock_break_cancelled:1; bool status_file_deleted:1; /* file has been deleted */ bool offload:1; /* offload final part of _put to a wq */ - unsigned int oplock_epoch; /* epoch from the lease break */ + __u16 oplock_epoch; /* epoch from the lease break */ __u32 oplock_level; /* oplock/lease level from the lease break */ int count; spinlock_t file_info_lock; /* protects four flag/count fields above */ @@ -1476,7 +1509,6 @@ struct cifs_io_parms { struct cifs_io_request { struct netfs_io_request rreq; struct cifsFileInfo *cfile; - struct TCP_Server_Info *server; pid_t pid; }; @@ -1545,7 +1577,7 @@ struct cifsInodeInfo { spinlock_t open_file_lock; /* protects openFileList */ __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ unsigned int oplock; /* oplock/lease level we have */ - unsigned int epoch; /* used to track lease state changes */ + __u16 epoch; /* used to track lease state changes */ #define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */ #define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */ #define CIFS_INODE_FLAG_UNUSED (2) /* Unused flag */ @@ -2118,6 +2150,8 @@ static inline char *get_security_type_str(enum securityEnum sectype) return "Kerberos"; case NTLMv2: return "NTLMv2"; + case IAKerb: + return "IAKerb"; default: return "Unknown"; } @@ -2173,11 +2207,13 @@ static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses) static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const FILE_ALL_INFO *src) { - memcpy(dst, src, (size_t)((u8 *)&src->AccessFlags - (u8 *)src)); - dst->AccessFlags = src->AccessFlags; - dst->CurrentByteOffset = src->CurrentByteOffset; - dst->Mode = src->Mode; - dst->AlignmentRequirement = src->AlignmentRequirement; + memcpy(dst, src, (size_t)((u8 *)&src->EASize - (u8 *)src)); + dst->IndexNumber = 0; + dst->EASize = src->EASize; + dst->AccessFlags = 0; + dst->CurrentByteOffset = 0; + dst->Mode = 0; + dst->AlignmentRequirement = 0; dst->FileNameLength = src->FileNameLength; } @@ -2289,8 +2325,8 @@ struct smb2_compound_vars { struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; struct kvec close_iov; - struct smb2_file_rename_info rename_info; - struct smb2_file_link_info link_info; + struct smb2_file_rename_info_hdr rename_info; + struct smb2_file_link_info_hdr link_info; struct kvec ea_iov; }; diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index 5c047b00516f..48d0d6f439cf 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -190,42 +190,82 @@ */ #define FILE_READ_DATA 0x00000001 /* Data can be read from the file */ + /* or directory child entries can */ + /* be listed together with the */ + /* associated child attributes */ + /* (so the FILE_READ_ATTRIBUTES on */ + /* the child entry is not needed) */ #define FILE_WRITE_DATA 0x00000002 /* Data can be written to the file */ + /* or new file can be created in */ + /* the directory */ #define FILE_APPEND_DATA 0x00000004 /* Data can be appended to the file */ + /* (for non-local files over SMB it */ + /* is same as FILE_WRITE_DATA) */ + /* or new subdirectory can be */ + /* created in the directory */ #define FILE_READ_EA 0x00000008 /* Extended attributes associated */ /* with the file can be read */ #define FILE_WRITE_EA 0x00000010 /* Extended attributes associated */ /* with the file can be written */ #define FILE_EXECUTE 0x00000020 /*Data can be read into memory from */ /* the file using system paging I/O */ -#define FILE_DELETE_CHILD 0x00000040 + /* for executing the file / script */ + /* or right to traverse directory */ + /* (but by default all users have */ + /* directory bypass traverse */ + /* privilege and do not need this */ + /* permission on directories at all)*/ +#define FILE_DELETE_CHILD 0x00000040 /* Child entry can be deleted from */ + /* the directory (so the DELETE on */ + /* the child entry is not needed) */ #define FILE_READ_ATTRIBUTES 0x00000080 /* Attributes associated with the */ - /* file can be read */ + /* file or directory can be read */ #define FILE_WRITE_ATTRIBUTES 0x00000100 /* Attributes associated with the */ - /* file can be written */ -#define DELETE 0x00010000 /* The file can be deleted */ -#define READ_CONTROL 0x00020000 /* The access control list and */ - /* ownership associated with the */ - /* file can be read */ -#define WRITE_DAC 0x00040000 /* The access control list and */ - /* ownership associated with the */ - /* file can be written. */ + /* file or directory can be written */ +#define DELETE 0x00010000 /* The file or dir can be deleted */ +#define READ_CONTROL 0x00020000 /* The discretionary access control */ + /* list and ownership associated */ + /* with the file or dir can be read */ +#define WRITE_DAC 0x00040000 /* The discretionary access control */ + /* list associated with the file or */ + /* directory can be written */ #define WRITE_OWNER 0x00080000 /* Ownership information associated */ - /* with the file can be written */ + /* with the file/dir can be written */ #define SYNCHRONIZE 0x00100000 /* The file handle can waited on to */ /* synchronize with the completion */ /* of an input/output request */ #define SYSTEM_SECURITY 0x01000000 /* The system access control list */ - /* can be read and changed */ -#define GENERIC_ALL 0x10000000 -#define GENERIC_EXECUTE 0x20000000 -#define GENERIC_WRITE 0x40000000 -#define GENERIC_READ 0x80000000 - /* In summary - Relevant file */ - /* access flags from CIFS are */ - /* file_read_data, file_write_data */ - /* file_execute, file_read_attributes*/ - /* write_dac, and delete. */ + /* associated with the file or */ + /* directory can be read or written */ + /* (cannot be in DACL, can in SACL) */ +#define MAXIMUM_ALLOWED 0x02000000 /* Maximal subset of GENERIC_ALL */ + /* permissions which can be granted */ + /* (cannot be in DACL nor SACL) */ +#define GENERIC_ALL 0x10000000 /* Same as: GENERIC_EXECUTE | */ + /* GENERIC_WRITE | */ + /* GENERIC_READ | */ + /* FILE_DELETE_CHILD | */ + /* DELETE | */ + /* WRITE_DAC | */ + /* WRITE_OWNER */ + /* So GENERIC_ALL contains all bits */ + /* mentioned above except these two */ + /* SYSTEM_SECURITY MAXIMUM_ALLOWED */ +#define GENERIC_EXECUTE 0x20000000 /* Same as: FILE_EXECUTE | */ + /* FILE_READ_ATTRIBUTES | */ + /* READ_CONTROL | */ + /* SYNCHRONIZE */ +#define GENERIC_WRITE 0x40000000 /* Same as: FILE_WRITE_DATA | */ + /* FILE_APPEND_DATA | */ + /* FILE_WRITE_EA | */ + /* FILE_WRITE_ATTRIBUTES | */ + /* READ_CONTROL | */ + /* SYNCHRONIZE */ +#define GENERIC_READ 0x80000000 /* Same as: FILE_READ_DATA | */ + /* FILE_READ_EA | */ + /* FILE_READ_ATTRIBUTES | */ + /* READ_CONTROL | */ + /* SYNCHRONIZE */ #define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_READ_ATTRIBUTES) #define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \ @@ -1484,20 +1524,6 @@ struct file_notify_information { __u8 FileName[]; } __attribute__((packed)); -/* For IO_REPARSE_TAG_NFS */ -#define NFS_SPECFILE_LNK 0x00000000014B4E4C -#define NFS_SPECFILE_CHR 0x0000000000524843 -#define NFS_SPECFILE_BLK 0x00000000004B4C42 -#define NFS_SPECFILE_FIFO 0x000000004F464946 -#define NFS_SPECFILE_SOCK 0x000000004B434F53 -struct reparse_posix_data { - __le32 ReparseTag; - __le16 ReparseDataLength; - __u16 Reserved; - __le64 InodeType; /* LNK, FIFO, CHR etc. */ - __u8 DataBuffer[]; -} __attribute__((packed)); - struct cifs_quota_data { __u32 rsrvd1; /* 0 */ __u32 sid_size; @@ -2264,13 +2290,7 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */ __u8 DeletePending; __u8 Directory; __u16 Pad2; - __le64 IndexNumber; __le32 EASize; - __le32 AccessFlags; - __u64 IndexNumber1; - __le64 CurrentByteOffset; - __le32 Mode; - __le32 AlignmentRequirement; __le32 FileNameLength; union { char __pad; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 223e5e231f42..81680001944d 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -557,7 +557,7 @@ extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nls_codepage, struct cifs_sb_info *cifs_sb); extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, - __u16 fid, struct smb_ntsd **acl_inf, __u32 *buflen); + __u16 fid, struct smb_ntsd **acl_inf, __u32 *buflen, __u32 info); extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16, struct smb_ntsd *pntsd, __u32 len, int aclflag); extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, @@ -656,7 +656,7 @@ char *extract_sharename(const char *unc); int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, const char *full_path, - bool unicode, struct cifs_open_info_data *data); + struct cifs_open_info_data *data); int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev, diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 7f1cacc89dbb..3feaa0f68169 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -3369,7 +3369,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata, /* Get Security Descriptor (by handle) from remote server for a file or dir */ int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, - struct smb_ntsd **acl_inf, __u32 *pbuflen) + struct smb_ntsd **acl_inf, __u32 *pbuflen, __u32 info) { int rc = 0; int buf_type = 0; @@ -3392,7 +3392,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, pSMB->MaxSetupCount = 0; pSMB->Fid = fid; /* file handle always le */ pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP | - CIFS_ACL_DACL); + CIFS_ACL_DACL | info); pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */ inc_rfc1001_len(pSMB, 11); iov[0].iov_base = (char *)pSMB; diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 880d7cf8b730..f917de020dd5 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2849,6 +2849,10 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) return 0; if (old->ctx->reparse_type != new->ctx->reparse_type) return 0; + if (old->ctx->nonativesocket != new->ctx->nonativesocket) + return 0; + if (old->ctx->symlink_type != new->ctx->symlink_type) + return 0; return 1; } diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index dad521336b5e..f65a8a90ba27 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -150,25 +150,27 @@ again: if (rc) continue; - if (tgt.flags & DFSREF_STORAGE_SERVER) { - rc = cifs_mount_get_tcon(mnt_ctx); - if (!rc) - rc = cifs_is_path_remote(mnt_ctx); + rc = cifs_mount_get_tcon(mnt_ctx); + if (rc) { + if (tgt.server_type == DFS_TYPE_LINK && + DFS_INTERLINK(tgt.flags)) + rc = -EREMOTE; + } else { + rc = cifs_is_path_remote(mnt_ctx); if (!rc) { ref_walk_set_tgt_hint(rw); break; } - if (rc != -EREMOTE) - continue; } - - rc = ref_walk_advance(rw); - if (!rc) { - rc = setup_dfs_ref(&tgt, rw); - if (rc) - break; - ref_walk_mark_end(rw); - goto again; + if (rc == -EREMOTE) { + rc = ref_walk_advance(rw); + if (!rc) { + rc = setup_dfs_ref(&tgt, rw); + if (rc) + break; + ref_walk_mark_end(rw); + goto again; + } } } } while (rc && ref_walk_descend(rw)); diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h index ed4cd7cf1ec6..e60f0a24a8a1 100644 --- a/fs/smb/client/dfs.h +++ b/fs/smb/client/dfs.h @@ -188,4 +188,11 @@ static inline void dfs_put_root_smb_sessions(struct list_head *head) } } +static inline const char *dfs_ses_refpath(struct cifs_ses *ses) +{ + const char *path = ses->server->leaf_fullpath; + + return path ? path + 1 : ERR_PTR(-ENOENT); +} + #endif /* _CIFS_DFS_H */ diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 5022bb1f122a..4dada26d56b5 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1136,33 +1136,19 @@ static bool is_ses_good(struct cifs_ses *ses) return ret; } -static char *get_ses_refpath(struct cifs_ses *ses) -{ - struct TCP_Server_Info *server = ses->server; - char *path = ERR_PTR(-ENOENT); - - if (server->leaf_fullpath) { - path = kstrdup(server->leaf_fullpath + 1, GFP_KERNEL); - if (!path) - path = ERR_PTR(-ENOMEM); - } - return path; -} - /* Refresh dfs referral of @ses */ static void refresh_ses_referral(struct cifs_ses *ses) { struct cache_entry *ce; unsigned int xid; - char *path; + const char *path; int rc = 0; xid = get_xid(); - path = get_ses_refpath(ses); + path = dfs_ses_refpath(ses); if (IS_ERR(path)) { rc = PTR_ERR(path); - path = NULL; goto out; } @@ -1181,7 +1167,6 @@ static void refresh_ses_referral(struct cifs_ses *ses) out: free_xid(xid); - kfree(path); } static int __refresh_tcon_referral(struct cifs_tcon *tcon, @@ -1231,19 +1216,18 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh) struct dfs_info3_param *refs = NULL; struct cache_entry *ce; struct cifs_ses *ses; - unsigned int xid; bool needs_refresh; - char *path; + const char *path; + unsigned int xid; int numrefs = 0; int rc = 0; xid = get_xid(); ses = tcon->ses; - path = get_ses_refpath(ses); + path = dfs_ses_refpath(ses); if (IS_ERR(path)) { rc = PTR_ERR(path); - path = NULL; goto out; } @@ -1271,7 +1255,6 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh) out: free_xid(xid); - kfree(path); free_dfs_info_array(refs, numrefs); } diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index 1822493dd084..d1e95632ac54 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -737,7 +737,8 @@ again: } static int -cifs_d_revalidate(struct dentry *direntry, unsigned int flags) +cifs_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *direntry, unsigned int flags) { struct inode *inode; int rc; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 79de2f2f9c41..8582cf61242c 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -147,7 +147,7 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); - struct TCP_Server_Info *server = req->server; + struct TCP_Server_Info *server; struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); size_t size; int rc = 0; @@ -156,6 +156,8 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) rdata->xid = get_xid(); rdata->have_xid = true; } + + server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); rdata->server = server; if (cifs_sb->ctx->rsize == 0) @@ -198,7 +200,7 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); - struct TCP_Server_Info *server = req->server; + struct TCP_Server_Info *server = rdata->server; int rc = 0; cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", @@ -266,7 +268,6 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) open_file = file->private_data; rreq->netfs_priv = file->private_data; req->cfile = cifsFileInfo_get(open_file); - req->server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) req->pid = req->cfile->pid; } else if (rreq->origin != NETFS_WRITEBACK) { diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 5381f05420bc..e9b286d9a7ba 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -133,6 +133,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_flag("rootfs", Opt_rootfs), fsparam_flag("compress", Opt_compress), fsparam_flag("witness", Opt_witness), + fsparam_flag_no("nativesocket", Opt_nativesocket), /* Mount options which take uid or gid */ fsparam_uid("backupuid", Opt_backupuid), @@ -185,6 +186,8 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_string("cache", Opt_cache), fsparam_string("reparse", Opt_reparse), fsparam_string("upcall_target", Opt_upcalltarget), + fsparam_string("symlink", Opt_symlink), + fsparam_string("symlinkroot", Opt_symlinkroot), /* Arguments that should be ignored */ fsparam_flag("guest", Opt_ignore), @@ -332,6 +335,7 @@ cifs_parse_cache_flavor(struct fs_context *fc, char *value, struct smb3_fs_conte static const match_table_t reparse_flavor_tokens = { { Opt_reparse_default, "default" }, + { Opt_reparse_none, "none" }, { Opt_reparse_nfs, "nfs" }, { Opt_reparse_wsl, "wsl" }, { Opt_reparse_err, NULL }, @@ -346,6 +350,9 @@ static int parse_reparse_flavor(struct fs_context *fc, char *value, case Opt_reparse_default: ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT; break; + case Opt_reparse_none: + ctx->reparse_type = CIFS_REPARSE_TYPE_NONE; + break; case Opt_reparse_nfs: ctx->reparse_type = CIFS_REPARSE_TYPE_NFS; break; @@ -359,6 +366,55 @@ static int parse_reparse_flavor(struct fs_context *fc, char *value, return 0; } +static const match_table_t symlink_flavor_tokens = { + { Opt_symlink_default, "default" }, + { Opt_symlink_none, "none" }, + { Opt_symlink_native, "native" }, + { Opt_symlink_unix, "unix" }, + { Opt_symlink_mfsymlinks, "mfsymlinks" }, + { Opt_symlink_sfu, "sfu" }, + { Opt_symlink_nfs, "nfs" }, + { Opt_symlink_wsl, "wsl" }, + { Opt_symlink_err, NULL }, +}; + +static int parse_symlink_flavor(struct fs_context *fc, char *value, + struct smb3_fs_context *ctx) +{ + substring_t args[MAX_OPT_ARGS]; + + switch (match_token(value, symlink_flavor_tokens, args)) { + case Opt_symlink_default: + ctx->symlink_type = CIFS_SYMLINK_TYPE_DEFAULT; + break; + case Opt_symlink_none: + ctx->symlink_type = CIFS_SYMLINK_TYPE_NONE; + break; + case Opt_symlink_native: + ctx->symlink_type = CIFS_SYMLINK_TYPE_NATIVE; + break; + case Opt_symlink_unix: + ctx->symlink_type = CIFS_SYMLINK_TYPE_UNIX; + break; + case Opt_symlink_mfsymlinks: + ctx->symlink_type = CIFS_SYMLINK_TYPE_MFSYMLINKS; + break; + case Opt_symlink_sfu: + ctx->symlink_type = CIFS_SYMLINK_TYPE_SFU; + break; + case Opt_symlink_nfs: + ctx->symlink_type = CIFS_SYMLINK_TYPE_NFS; + break; + case Opt_symlink_wsl: + ctx->symlink_type = CIFS_SYMLINK_TYPE_WSL; + break; + default: + cifs_errorf(fc, "bad symlink= option: %s\n", value); + return 1; + } + return 0; +} + #define DUP_CTX_STR(field) \ do { \ if (ctx->field) { \ @@ -386,6 +442,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx new_ctx->iocharset = NULL; new_ctx->leaf_fullpath = NULL; new_ctx->dns_dom = NULL; + new_ctx->symlinkroot = NULL; /* * Make sure to stay in sync with smb3_cleanup_fs_context_contents() */ @@ -401,6 +458,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx DUP_CTX_STR(iocharset); DUP_CTX_STR(leaf_fullpath); DUP_CTX_STR(dns_dom); + DUP_CTX_STR(symlinkroot); return 0; } @@ -1727,6 +1785,23 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, if (parse_reparse_flavor(fc, param->string, ctx)) goto cifs_parse_mount_err; break; + case Opt_nativesocket: + ctx->nonativesocket = result.negated; + break; + case Opt_symlink: + if (parse_symlink_flavor(fc, param->string, ctx)) + goto cifs_parse_mount_err; + break; + case Opt_symlinkroot: + if (param->string[0] != '/') { + cifs_errorf(fc, "symlinkroot mount options must be absolute path\n"); + goto cifs_parse_mount_err; + } + kfree(ctx->symlinkroot); + ctx->symlinkroot = kstrdup(param->string, GFP_KERNEL); + if (!ctx->symlinkroot) + goto cifs_parse_mount_err; + break; } /* case Opt_ignore: - is ignored as expected ... */ @@ -1735,6 +1810,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; } + /* + * By default resolve all native absolute symlinks relative to "/mnt/". + * Same default has drvfs driver running in WSL for resolving SMB shares. + */ + if (!ctx->symlinkroot) + ctx->symlinkroot = kstrdup("/mnt/", GFP_KERNEL); + return 0; cifs_parse_mount_err: @@ -1745,6 +1827,24 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, return -EINVAL; } +enum cifs_symlink_type get_cifs_symlink_type(struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->ctx->symlink_type == CIFS_SYMLINK_TYPE_DEFAULT) { + if (cifs_sb->ctx->mfsymlinks) + return CIFS_SYMLINK_TYPE_MFSYMLINKS; + else if (cifs_sb->ctx->sfu_emul) + return CIFS_SYMLINK_TYPE_SFU; + else if (cifs_sb->ctx->linux_ext && !cifs_sb->ctx->no_linux_ext) + return CIFS_SYMLINK_TYPE_UNIX; + else if (cifs_sb->ctx->reparse_type != CIFS_REPARSE_TYPE_NONE) + return CIFS_SYMLINK_TYPE_NATIVE; + else + return CIFS_SYMLINK_TYPE_NONE; + } else { + return cifs_sb->ctx->symlink_type; + } +} + int smb3_init_fs_context(struct fs_context *fc) { struct smb3_fs_context *ctx; @@ -1821,6 +1921,8 @@ int smb3_init_fs_context(struct fs_context *fc) ctx->retrans = 1; ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT; + ctx->symlink_type = CIFS_SYMLINK_TYPE_DEFAULT; + ctx->nonativesocket = 0; /* * short int override_uid = -1; @@ -1867,6 +1969,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) ctx->leaf_fullpath = NULL; kfree(ctx->dns_dom); ctx->dns_dom = NULL; + kfree(ctx->symlinkroot); + ctx->symlinkroot = NULL; } void diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 8813533345ee..881bfc08667e 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -43,11 +43,24 @@ enum { enum cifs_reparse_parm { Opt_reparse_default, + Opt_reparse_none, Opt_reparse_nfs, Opt_reparse_wsl, Opt_reparse_err }; +enum cifs_symlink_parm { + Opt_symlink_default, + Opt_symlink_none, + Opt_symlink_native, + Opt_symlink_unix, + Opt_symlink_mfsymlinks, + Opt_symlink_sfu, + Opt_symlink_nfs, + Opt_symlink_wsl, + Opt_symlink_err +}; + enum cifs_sec_param { Opt_sec_krb5, Opt_sec_krb5i, @@ -166,6 +179,9 @@ enum cifs_param { Opt_cache, Opt_reparse, Opt_upcalltarget, + Opt_nativesocket, + Opt_symlink, + Opt_symlinkroot, /* Mount options to be ignored */ Opt_ignore, @@ -294,12 +310,17 @@ struct smb3_fs_context { struct cifs_ses *dfs_root_ses; bool dfs_automount:1; /* set for dfs automount only */ enum cifs_reparse_type reparse_type; + enum cifs_symlink_type symlink_type; + bool nonativesocket:1; bool dfs_conn:1; /* set for dfs mounts */ char *dns_dom; + char *symlinkroot; /* top level directory for native SMB symlinks in absolute format */ }; extern const struct fs_parameter_spec smb3_fs_parameters[]; +extern enum cifs_symlink_type get_cifs_symlink_type(struct cifs_sb_info *cifs_sb); + extern int smb3_init_fs_context(struct fs_context *fc); extern void smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx); extern void smb3_cleanup_fs_context(struct smb3_fs_context *ctx); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 93e9188b2632..616149c7f0a5 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -990,7 +990,7 @@ cifs_get_file_info(struct file *filp) /* TODO: add support to query reparse tag */ data.adjust_tz = false; if (data.symlink_target) { - data.symlink = true; + data.reparse_point = true; data.reparse.tag = IO_REPARSE_TAG_SYMLINK; } path = build_path_from_dentry(dentry, page); @@ -1215,6 +1215,24 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, rc = server->ops->parse_reparse_point(cifs_sb, full_path, iov, data); + /* + * If the reparse point was not handled but it is the + * name surrogate which points to directory, then treat + * is as a new mount point. Name surrogate reparse point + * represents another named entity in the system. + */ + if (rc == -EOPNOTSUPP && + IS_REPARSE_TAG_NAME_SURROGATE(data->reparse.tag) && + (le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY)) { + rc = 0; + cifs_create_junction_fattr(fattr, sb); + goto out; + } + } + + if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) { + bool directory = le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY; + rc = smb2_fix_symlink_target_type(&data->symlink_target, directory, cifs_sb); } break; } @@ -1403,7 +1421,7 @@ int cifs_get_inode_info(struct inode **inode, struct cifs_fattr fattr = {}; int rc; - if (is_inode_cache_good(*inode)) { + if (!data && is_inode_cache_good(*inode)) { cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); return 0; } @@ -1502,7 +1520,7 @@ int smb311_posix_get_inode_info(struct inode **inode, struct cifs_fattr fattr = {}; int rc; - if (is_inode_cache_good(*inode)) { + if (!data && is_inode_cache_good(*inode)) { cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); return 0; } diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index 47ddeb7fa111..6e6c09cc5ce7 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -18,6 +18,7 @@ #include "cifs_unicode.h" #include "smb2proto.h" #include "cifs_ioctl.h" +#include "fs_context.h" /* * M-F Symlink Functions - Begin @@ -604,22 +605,53 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, cifs_dbg(FYI, "symname is %s\n", symname); /* BB what if DFS and this volume is on different share? BB */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname); - } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { - rc = __cifs_sfu_make_node(xid, inode, direntry, pTcon, - full_path, S_IFLNK, 0, symname); + rc = -EOPNOTSUPP; + switch (get_cifs_symlink_type(cifs_sb)) { + case CIFS_SYMLINK_TYPE_DEFAULT: + /* should not happen, get_cifs_symlink_type() resolves the default */ + break; + + case CIFS_SYMLINK_TYPE_NONE: + break; + + case CIFS_SYMLINK_TYPE_UNIX: #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - } else if (pTcon->unix_ext) { - rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); + if (pTcon->unix_ext) { + rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, + symname, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ - } else if (server->ops->create_reparse_symlink) { - rc = server->ops->create_reparse_symlink(xid, inode, direntry, - pTcon, full_path, - symname); - goto symlink_exit; + break; + + case CIFS_SYMLINK_TYPE_MFSYMLINKS: + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { + rc = create_mf_symlink(xid, pTcon, cifs_sb, + full_path, symname); + } + break; + + case CIFS_SYMLINK_TYPE_SFU: + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + rc = __cifs_sfu_make_node(xid, inode, direntry, pTcon, + full_path, S_IFLNK, + 0, symname); + } + break; + + case CIFS_SYMLINK_TYPE_NATIVE: + case CIFS_SYMLINK_TYPE_NFS: + case CIFS_SYMLINK_TYPE_WSL: + if (server->ops->create_reparse_symlink) { + rc = server->ops->create_reparse_symlink(xid, inode, + direntry, + pTcon, + full_path, + symname); + goto symlink_exit; + } + break; } if (rc == 0) { diff --git a/fs/smb/client/netmisc.c b/fs/smb/client/netmisc.c index 17b3e21ea868..9ec20601cee2 100644 --- a/fs/smb/client/netmisc.c +++ b/fs/smb/client/netmisc.c @@ -313,7 +313,6 @@ static const struct { ERRDOS, 2215, NT_STATUS_NO_LOGON_SERVERS}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_LOGON_SESSION}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PRIVILEGE}, { - ERRDOS, ERRnoaccess, NT_STATUS_PRIVILEGE_NOT_HELD}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACCOUNT_NAME}, { ERRHRD, ERRgeneral, NT_STATUS_USER_EXISTS}, /* { This NT error code was 'sqashed' @@ -871,6 +870,15 @@ map_smb_to_linux_error(char *buf, bool logErr) } /* else ERRHRD class errors or junk - return EIO */ + /* special cases for NT status codes which cannot be translated to DOS codes */ + if (smb->Flags2 & SMBFLG2_ERR_STATUS) { + __u32 err = le32_to_cpu(smb->Status.CifsError); + if (err == (NT_STATUS_NOT_A_REPARSE_POINT)) + rc = -ENODATA; + else if (err == (NT_STATUS_PRIVILEGE_NOT_HELD)) + rc = -EPERM; + } + cifs_dbg(FYI, "Mapping smb error code 0x%x to POSIX err %d\n", le32_to_cpu(smb->Status.CifsError), rc); diff --git a/fs/smb/client/nterr.c b/fs/smb/client/nterr.c index d396a8e98a81..8f0bc441295e 100644 --- a/fs/smb/client/nterr.c +++ b/fs/smb/client/nterr.c @@ -674,6 +674,7 @@ const struct nt_err_code_struct nt_errs[] = { {"NT_STATUS_QUOTA_LIST_INCONSISTENT", NT_STATUS_QUOTA_LIST_INCONSISTENT}, {"NT_STATUS_FILE_IS_OFFLINE", NT_STATUS_FILE_IS_OFFLINE}, + {"NT_STATUS_NOT_A_REPARSE_POINT", NT_STATUS_NOT_A_REPARSE_POINT}, {"NT_STATUS_NO_MORE_ENTRIES", NT_STATUS_NO_MORE_ENTRIES}, {"NT_STATUS_MORE_ENTRIES", NT_STATUS_MORE_ENTRIES}, {"NT_STATUS_SOME_UNMAPPED", NT_STATUS_SOME_UNMAPPED}, diff --git a/fs/smb/client/nterr.h b/fs/smb/client/nterr.h index edd4741cab0a..180602c22355 100644 --- a/fs/smb/client/nterr.h +++ b/fs/smb/client/nterr.h @@ -546,6 +546,7 @@ extern const struct nt_err_code_struct nt_errs[]; #define NT_STATUS_TOO_MANY_LINKS 0xC0000000 | 0x0265 #define NT_STATUS_QUOTA_LIST_INCONSISTENT 0xC0000000 | 0x0266 #define NT_STATUS_FILE_IS_OFFLINE 0xC0000000 | 0x0267 +#define NT_STATUS_NOT_A_REPARSE_POINT 0xC0000000 | 0x0275 #define NT_STATUS_NO_SUCH_JOB 0xC0000000 | 0xEDE /* scheduler */ #endif /* _NTERR_H */ diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index d88b41133e00..2b9e9885dc42 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -14,6 +14,20 @@ #include "fs_context.h" #include "reparse.h" +static int mknod_nfs(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev, + const char *symname); + +static int mknod_wsl(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev, + const char *symname); + +static int create_native_symlink(const unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, const char *symname); + static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb, const unsigned int xid, const char *full_path, @@ -24,37 +38,148 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, const char *symname) { + switch (get_cifs_symlink_type(CIFS_SB(inode->i_sb))) { + case CIFS_SYMLINK_TYPE_NATIVE: + return create_native_symlink(xid, inode, dentry, tcon, full_path, symname); + case CIFS_SYMLINK_TYPE_NFS: + return mknod_nfs(xid, inode, dentry, tcon, full_path, S_IFLNK, 0, symname); + case CIFS_SYMLINK_TYPE_WSL: + return mknod_wsl(xid, inode, dentry, tcon, full_path, S_IFLNK, 0, symname); + default: + return -EOPNOTSUPP; + } +} + +static int create_native_symlink(const unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, const char *symname) +{ struct reparse_symlink_data_buffer *buf = NULL; - struct cifs_open_info_data data; + struct cifs_open_info_data data = {}; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct inode *new; struct kvec iov; - __le16 *path; + __le16 *path = NULL; bool directory; - char *sym, sep = CIFS_DIR_SEP(cifs_sb); - u16 len, plen; + char *symlink_target = NULL; + char *sym = NULL; + char sep = CIFS_DIR_SEP(cifs_sb); + u16 len, plen, poff, slen; int rc = 0; if (strlen(symname) > REPARSE_SYM_PATH_MAX) return -ENAMETOOLONG; - sym = kstrdup(symname, GFP_KERNEL); - if (!sym) - return -ENOMEM; + symlink_target = kstrdup(symname, GFP_KERNEL); + if (!symlink_target) { + rc = -ENOMEM; + goto out; + } data = (struct cifs_open_info_data) { .reparse_point = true, .reparse = { .tag = IO_REPARSE_TAG_SYMLINK, }, - .symlink_target = sym, + .symlink_target = symlink_target, }; - convert_delimiter(sym, sep); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') { + /* + * This is a request to create an absolute symlink on the server + * which does not support POSIX paths, and expects symlink in + * NT-style path. So convert absolute Linux symlink target path + * to the absolute NT-style path. Root of the NT-style path for + * symlinks is specified in "symlinkroot" mount option. This will + * ensure compatibility of this symlink stored in absolute form + * on the SMB server. + */ + if (!strstarts(symname, cifs_sb->ctx->symlinkroot)) { + /* + * If the absolute Linux symlink target path is not + * inside "symlinkroot" location then there is no way + * to convert such Linux symlink to NT-style path. + */ + cifs_dbg(VFS, + "absolute symlink '%s' cannot be converted to NT format " + "because it is outside of symlinkroot='%s'\n", + symname, cifs_sb->ctx->symlinkroot); + rc = -EINVAL; + goto out; + } + len = strlen(cifs_sb->ctx->symlinkroot); + if (cifs_sb->ctx->symlinkroot[len-1] != '/') + len++; + if (symname[len] >= 'a' && symname[len] <= 'z' && + (symname[len+1] == '/' || symname[len+1] == '\0')) { + /* + * Symlink points to Linux target /symlinkroot/x/path/... + * where 'x' is the lowercase local Windows drive. + * NT-style path for 'x' has common form \??\X:\path\... + * with uppercase local Windows drive. + */ + int common_path_len = strlen(symname+len+1)+1; + sym = kzalloc(6+common_path_len, GFP_KERNEL); + if (!sym) { + rc = -ENOMEM; + goto out; + } + memcpy(sym, "\\??\\", 4); + sym[4] = symname[len] - ('a'-'A'); + sym[5] = ':'; + memcpy(sym+6, symname+len+1, common_path_len); + } else { + /* Unhandled absolute symlink. Report an error. */ + cifs_dbg( + VFS, + "absolute symlink '%s' cannot be converted to NT format " + "because it points to unknown target\n", + symname); + rc = -EINVAL; + goto out; + } + } else { + /* + * This is request to either create an absolute symlink on + * server which expects POSIX paths or it is an request to + * create a relative symlink from the current directory. + * These paths have same format as relative SMB symlinks, + * so no conversion is needed. So just take symname as-is. + */ + sym = kstrdup(symname, GFP_KERNEL); + if (!sym) { + rc = -ENOMEM; + goto out; + } + } + + if (sep == '\\') + convert_delimiter(sym, sep); + + /* + * For absolute NT symlinks it is required to pass also leading + * backslash and to not mangle NT object prefix "\\??\\" and not to + * mangle colon in drive letter. But cifs_convert_path_to_utf16() + * removes leading backslash and replaces '?' and ':'. So temporary + * mask these characters in NT object prefix by '_' and then change + * them back. + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') + sym[0] = sym[1] = sym[2] = sym[5] = '_'; + path = cifs_convert_path_to_utf16(sym, cifs_sb); if (!path) { rc = -ENOMEM; goto out; } + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') { + sym[0] = '\\'; + sym[1] = sym[2] = '?'; + sym[5] = ':'; + path[0] = cpu_to_le16('\\'); + path[1] = path[2] = cpu_to_le16('?'); + path[5] = cpu_to_le16(':'); + } + /* * SMB distinguish between symlink to directory and symlink to file. * They cannot be exchanged (symlink of file type which points to @@ -67,8 +192,18 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, if (rc < 0) goto out; - plen = 2 * UniStrnlen((wchar_t *)path, REPARSE_SYM_PATH_MAX); - len = sizeof(*buf) + plen * 2; + slen = 2 * UniStrnlen((wchar_t *)path, REPARSE_SYM_PATH_MAX); + poff = 0; + plen = slen; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') { + /* + * For absolute NT symlinks skip leading "\\??\\" in PrintName as + * PrintName is user visible location in DOS/Win32 format (not in NT format). + */ + poff = 4; + plen -= 2 * poff; + } + len = sizeof(*buf) + plen + slen; buf = kzalloc(len, GFP_KERNEL); if (!buf) { rc = -ENOMEM; @@ -77,17 +212,17 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK); buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer)); + buf->SubstituteNameOffset = cpu_to_le16(plen); - buf->SubstituteNameLength = cpu_to_le16(plen); - memcpy(&buf->PathBuffer[plen], path, plen); + buf->SubstituteNameLength = cpu_to_le16(slen); + memcpy(&buf->PathBuffer[plen], path, slen); + buf->PrintNameOffset = 0; buf->PrintNameLength = cpu_to_le16(plen); - memcpy(buf->PathBuffer, path, plen); + memcpy(buf->PathBuffer, path+poff, plen); + buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0); - if (*sym != sep) - buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE); - convert_delimiter(sym, '/'); iov.iov_base = buf; iov.iov_len = len; new = smb2_get_reparse_inode(&data, inode->i_sb, xid, @@ -98,6 +233,7 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, else rc = PTR_ERR(new); out: + kfree(sym); kfree(path); cifs_free_open_info(&data); kfree(buf); @@ -242,8 +378,39 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb, return 0; } -static int nfs_set_reparse_buf(struct reparse_posix_data *buf, +static int create_native_socket(const unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path) +{ + struct reparse_data_buffer buf = { + .ReparseTag = cpu_to_le32(IO_REPARSE_TAG_AF_UNIX), + .ReparseDataLength = cpu_to_le16(0), + }; + struct cifs_open_info_data data = { + .reparse_point = true, + .reparse = { .tag = IO_REPARSE_TAG_AF_UNIX, .buf = &buf, }, + }; + struct kvec iov = { + .iov_base = &buf, + .iov_len = sizeof(buf), + }; + struct inode *new; + int rc = 0; + + new = smb2_get_reparse_inode(&data, inode->i_sb, xid, + tcon, full_path, false, &iov, NULL); + if (!IS_ERR(new)) + d_instantiate(dentry, new); + else + rc = PTR_ERR(new); + cifs_free_open_info(&data); + return rc; +} + +static int nfs_set_reparse_buf(struct reparse_nfs_data_buffer *buf, mode_t mode, dev_t dev, + __le16 *symname_utf16, + int symname_utf16_len, struct kvec *iov) { u64 type; @@ -254,7 +421,13 @@ static int nfs_set_reparse_buf(struct reparse_posix_data *buf, switch ((type = reparse_mode_nfs_type(mode))) { case NFS_SPECFILE_BLK: case NFS_SPECFILE_CHR: - dlen = sizeof(__le64); + dlen = 2 * sizeof(__le32); + ((__le32 *)buf->DataBuffer)[0] = cpu_to_le32(MAJOR(dev)); + ((__le32 *)buf->DataBuffer)[1] = cpu_to_le32(MINOR(dev)); + break; + case NFS_SPECFILE_LNK: + dlen = symname_utf16_len; + memcpy(buf->DataBuffer, symname_utf16, symname_utf16_len); break; case NFS_SPECFILE_FIFO: case NFS_SPECFILE_SOCK: @@ -269,8 +442,6 @@ static int nfs_set_reparse_buf(struct reparse_posix_data *buf, buf->InodeType = cpu_to_le64(type); buf->ReparseDataLength = cpu_to_le16(len + dlen - sizeof(struct reparse_data_buffer)); - *(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MINOR(dev) << 32) | - MAJOR(dev)); iov->iov_base = buf; iov->iov_len = len + dlen; return 0; @@ -278,23 +449,45 @@ static int nfs_set_reparse_buf(struct reparse_posix_data *buf, static int mknod_nfs(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, umode_t mode, dev_t dev) + const char *full_path, umode_t mode, dev_t dev, + const char *symname) { + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_open_info_data data; - struct reparse_posix_data *p; + struct reparse_nfs_data_buffer *p = NULL; + __le16 *symname_utf16 = NULL; + int symname_utf16_len = 0; struct inode *new; struct kvec iov; __u8 buf[sizeof(*p) + sizeof(__le64)]; int rc; - p = (struct reparse_posix_data *)buf; - rc = nfs_set_reparse_buf(p, mode, dev, &iov); + if (S_ISLNK(mode)) { + symname_utf16 = cifs_strndup_to_utf16(symname, strlen(symname), + &symname_utf16_len, + cifs_sb->local_nls, + NO_MAP_UNI_RSVD); + if (!symname_utf16) { + rc = -ENOMEM; + goto out; + } + symname_utf16_len -= 2; /* symlink is without trailing wide-nul */ + p = kzalloc(sizeof(*p) + symname_utf16_len, GFP_KERNEL); + if (!p) { + rc = -ENOMEM; + goto out; + } + } else { + p = (struct reparse_nfs_data_buffer *)buf; + } + rc = nfs_set_reparse_buf(p, mode, dev, symname_utf16, symname_utf16_len, &iov); if (rc) - return rc; + goto out; data = (struct cifs_open_info_data) { .reparse_point = true, - .reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, }, + .reparse = { .tag = IO_REPARSE_TAG_NFS, .buf = (struct reparse_data_buffer *)p, }, + .symlink_target = kstrdup(symname, GFP_KERNEL), }; new = smb2_get_reparse_inode(&data, inode->i_sb, xid, @@ -304,12 +497,25 @@ static int mknod_nfs(unsigned int xid, struct inode *inode, else rc = PTR_ERR(new); cifs_free_open_info(&data); +out: + if (S_ISLNK(mode)) { + kfree(symname_utf16); + kfree(p); + } return rc; } -static int wsl_set_reparse_buf(struct reparse_data_buffer *buf, - mode_t mode, struct kvec *iov) +static int wsl_set_reparse_buf(struct reparse_data_buffer **buf, + mode_t mode, const char *symname, + struct cifs_sb_info *cifs_sb, + struct kvec *iov) { + struct reparse_wsl_symlink_data_buffer *symlink_buf; + __le16 *symname_utf16; + int symname_utf16_len; + int symname_utf8_maxlen; + int symname_utf8_len; + size_t buf_len; u32 tag; switch ((tag = reparse_mode_wsl_tag(mode))) { @@ -317,16 +523,45 @@ static int wsl_set_reparse_buf(struct reparse_data_buffer *buf, case IO_REPARSE_TAG_LX_CHR: case IO_REPARSE_TAG_LX_FIFO: case IO_REPARSE_TAG_AF_UNIX: + buf_len = sizeof(struct reparse_data_buffer); + *buf = kzalloc(buf_len, GFP_KERNEL); + if (!*buf) + return -ENOMEM; + break; + case IO_REPARSE_TAG_LX_SYMLINK: + symname_utf16 = cifs_strndup_to_utf16(symname, strlen(symname), + &symname_utf16_len, + cifs_sb->local_nls, + NO_MAP_UNI_RSVD); + if (!symname_utf16) + return -ENOMEM; + symname_utf8_maxlen = symname_utf16_len/2*3; + symlink_buf = kzalloc(sizeof(struct reparse_wsl_symlink_data_buffer) + + symname_utf8_maxlen, GFP_KERNEL); + if (!symlink_buf) { + kfree(symname_utf16); + return -ENOMEM; + } + /* Flag 0x02000000 is unknown, but all wsl symlinks have this value */ + symlink_buf->Flags = cpu_to_le32(0x02000000); + /* PathBuffer is in UTF-8 but without trailing null-term byte */ + symname_utf8_len = utf16s_to_utf8s((wchar_t *)symname_utf16, symname_utf16_len/2, + UTF16_LITTLE_ENDIAN, + symlink_buf->PathBuffer, + symname_utf8_maxlen); + *buf = (struct reparse_data_buffer *)symlink_buf; + buf_len = sizeof(struct reparse_wsl_symlink_data_buffer) + symname_utf8_len; + kfree(symname_utf16); break; default: return -EOPNOTSUPP; } - buf->ReparseTag = cpu_to_le32(tag); - buf->Reserved = 0; - buf->ReparseDataLength = 0; - iov->iov_base = buf; - iov->iov_len = sizeof(*buf); + (*buf)->ReparseTag = cpu_to_le32(tag); + (*buf)->Reserved = 0; + (*buf)->ReparseDataLength = cpu_to_le16(buf_len - sizeof(struct reparse_data_buffer)); + iov->iov_base = *buf; + iov->iov_len = buf_len; return 0; } @@ -415,27 +650,32 @@ static int wsl_set_xattrs(struct inode *inode, umode_t _mode, static int mknod_wsl(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, umode_t mode, dev_t dev) + const char *full_path, umode_t mode, dev_t dev, + const char *symname) { + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_open_info_data data; - struct reparse_data_buffer buf; + struct reparse_data_buffer *buf; struct smb2_create_ea_ctx *cc; struct inode *new; unsigned int len; struct kvec reparse_iov, xattr_iov; int rc; - rc = wsl_set_reparse_buf(&buf, mode, &reparse_iov); + rc = wsl_set_reparse_buf(&buf, mode, symname, cifs_sb, &reparse_iov); if (rc) return rc; rc = wsl_set_xattrs(inode, mode, dev, &xattr_iov); - if (rc) + if (rc) { + kfree(buf); return rc; + } data = (struct cifs_open_info_data) { .reparse_point = true, - .reparse = { .tag = le32_to_cpu(buf.ReparseTag), .buf = &buf, }, + .reparse = { .tag = le32_to_cpu(buf->ReparseTag), .buf = buf, }, + .symlink_target = kstrdup(symname, GFP_KERNEL), }; cc = xattr_iov.iov_base; @@ -452,6 +692,7 @@ static int mknod_wsl(unsigned int xid, struct inode *inode, rc = PTR_ERR(new); cifs_free_open_info(&data); kfree(xattr_iov.iov_base); + kfree(buf); return rc; } @@ -460,21 +701,22 @@ int smb2_mknod_reparse(unsigned int xid, struct inode *inode, const char *full_path, umode_t mode, dev_t dev) { struct smb3_fs_context *ctx = CIFS_SB(inode->i_sb)->ctx; - int rc = -EOPNOTSUPP; + + if (S_ISSOCK(mode) && !ctx->nonativesocket && ctx->reparse_type != CIFS_REPARSE_TYPE_NONE) + return create_native_socket(xid, inode, dentry, tcon, full_path); switch (ctx->reparse_type) { case CIFS_REPARSE_TYPE_NFS: - rc = mknod_nfs(xid, inode, dentry, tcon, full_path, mode, dev); - break; + return mknod_nfs(xid, inode, dentry, tcon, full_path, mode, dev, NULL); case CIFS_REPARSE_TYPE_WSL: - rc = mknod_wsl(xid, inode, dentry, tcon, full_path, mode, dev); - break; + return mknod_wsl(xid, inode, dentry, tcon, full_path, mode, dev, NULL); + default: + return -EOPNOTSUPP; } - return rc; } /* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */ -static int parse_reparse_posix(struct reparse_posix_data *buf, +static int parse_reparse_nfs(struct reparse_nfs_data_buffer *buf, struct cifs_sb_info *cifs_sb, struct cifs_open_info_data *data) { @@ -536,43 +778,160 @@ static int parse_reparse_posix(struct reparse_posix_data *buf, } int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, - bool unicode, bool relative, + bool relative, const char *full_path, struct cifs_sb_info *cifs_sb) { char sep = CIFS_DIR_SEP(cifs_sb); char *linux_target = NULL; char *smb_target = NULL; + int symlinkroot_len; + int abs_path_len; + char *abs_path; int levels; int rc; int i; - /* Check that length it valid for unicode/non-unicode mode */ - if (!len || (unicode && (len % 2))) { + /* Check that length it valid */ + if (!len || (len % 2)) { cifs_dbg(VFS, "srv returned malformed symlink buffer\n"); rc = -EIO; goto out; } /* - * Check that buffer does not contain UTF-16 null codepoint in unicode - * mode or null byte in non-unicode mode because Linux cannot process - * symlink with null byte. + * Check that buffer does not contain UTF-16 null codepoint + * because Linux cannot process symlink with null byte. */ - if ((unicode && UniStrnlen((wchar_t *)buf, len/2) != len/2) || - (!unicode && strnlen(buf, len) != len)) { + if (UniStrnlen((wchar_t *)buf, len/2) != len/2) { cifs_dbg(VFS, "srv returned null byte in native symlink target location\n"); rc = -EIO; goto out; } - smb_target = cifs_strndup_from_utf16(buf, len, unicode, cifs_sb->local_nls); + smb_target = cifs_strndup_from_utf16(buf, len, true, cifs_sb->local_nls); if (!smb_target) { rc = -ENOMEM; goto out; } - if (smb_target[0] == sep && relative) { + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && !relative) { + /* + * This is an absolute symlink from the server which does not + * support POSIX paths, so the symlink is in NT-style path. + * So convert it to absolute Linux symlink target path. Root of + * the NT-style path for symlinks is specified in "symlinkroot" + * mount option. + * + * Root of the DOS and Win32 paths is at NT path \??\ + * It means that DOS/Win32 path C:\folder\file.txt is + * NT path \??\C:\folder\file.txt + * + * NT systems have some well-known object symlinks in their NT + * hierarchy, which is needed to take into account when resolving + * other symlinks. Most commonly used symlink paths are: + * \?? -> \GLOBAL?? + * \DosDevices -> \?? + * \GLOBAL??\GLOBALROOT -> \ + * \GLOBAL??\Global -> \GLOBAL?? + * \GLOBAL??\NUL -> \Device\Null + * \GLOBAL??\UNC -> \Device\Mup + * \GLOBAL??\PhysicalDrive0 -> \Device\Harddisk0\DR0 (for each harddisk) + * \GLOBAL??\A: -> \Device\Floppy0 (if A: is the first floppy) + * \GLOBAL??\C: -> \Device\HarddiskVolume1 (if C: is the first harddisk) + * \GLOBAL??\D: -> \Device\CdRom0 (if D: is first cdrom) + * \SystemRoot -> \Device\Harddisk0\Partition1\WINDOWS (or where is NT system installed) + * \Volume{...} -> \Device\HarddiskVolume1 (where ... is system generated guid) + * + * In most common cases, absolute NT symlinks points to path on + * DOS/Win32 drive letter, system-specific Volume or on UNC share. + * Here are few examples of commonly used absolute NT symlinks + * created by mklink.exe tool: + * \??\C:\folder\file.txt + * \??\\C:\folder\file.txt + * \??\UNC\server\share\file.txt + * \??\\UNC\server\share\file.txt + * \??\Volume{b75e2c83-0000-0000-0000-602f00000000}\folder\file.txt + * + * It means that the most common path prefix \??\ is also NT path + * symlink (to \GLOBAL??). It is less common that second path + * separator is double backslash, but it is valid. + * + * Volume guid is randomly generated by the target system and so + * only the target system knows the mapping between guid and the + * hardisk number. Over SMB it is not possible to resolve this + * mapping, therefore symlinks pointing to target location of + * volume guids are totally unusable over SMB. + * + * For now parse only symlink paths available for DOS and Win32. + * Those are paths with \??\ prefix or paths which points to \??\ + * via other NT symlink (\DosDevices\, \GLOBAL??\, ...). + */ + abs_path = smb_target; +globalroot: + if (strstarts(abs_path, "\\??\\")) + abs_path += sizeof("\\??\\")-1; + else if (strstarts(abs_path, "\\DosDevices\\")) + abs_path += sizeof("\\DosDevices\\")-1; + else if (strstarts(abs_path, "\\GLOBAL??\\")) + abs_path += sizeof("\\GLOBAL??\\")-1; + else { + /* Unhandled absolute symlink, points outside of DOS/Win32 */ + cifs_dbg(VFS, + "absolute symlink '%s' cannot be converted from NT format " + "because points to unknown target\n", + smb_target); + rc = -EIO; + goto out; + } + + /* Sometimes path separator after \?? is double backslash */ + if (abs_path[0] == '\\') + abs_path++; + + while (strstarts(abs_path, "Global\\")) + abs_path += sizeof("Global\\")-1; + + if (strstarts(abs_path, "GLOBALROOT\\")) { + /* Label globalroot requires path with leading '\\', so do not trim '\\' */ + abs_path += sizeof("GLOBALROOT")-1; + goto globalroot; + } + + /* For now parse only paths to drive letters */ + if (((abs_path[0] >= 'A' && abs_path[0] <= 'Z') || + (abs_path[0] >= 'a' && abs_path[0] <= 'z')) && + abs_path[1] == ':' && + (abs_path[2] == '\\' || abs_path[2] == '\0')) { + /* Convert drive letter to lowercase and drop colon */ + char drive_letter = abs_path[0]; + if (drive_letter >= 'A' && drive_letter <= 'Z') + drive_letter += 'a'-'A'; + abs_path++; + abs_path[0] = drive_letter; + } else { + /* Unhandled absolute symlink. Report an error. */ + cifs_dbg(VFS, + "absolute symlink '%s' cannot be converted from NT format " + "because points to unknown target\n", + smb_target); + rc = -EIO; + goto out; + } + + abs_path_len = strlen(abs_path)+1; + symlinkroot_len = strlen(cifs_sb->ctx->symlinkroot); + if (cifs_sb->ctx->symlinkroot[symlinkroot_len-1] == '/') + symlinkroot_len--; + linux_target = kmalloc(symlinkroot_len + 1 + abs_path_len, GFP_KERNEL); + if (!linux_target) { + rc = -ENOMEM; + goto out; + } + memcpy(linux_target, cifs_sb->ctx->symlinkroot, symlinkroot_len); + linux_target[symlinkroot_len] = '/'; + memcpy(linux_target + symlinkroot_len + 1, abs_path, abs_path_len); + } else if (smb_target[0] == sep && relative) { /* * This is a relative SMB symlink from the top of the share, * which is the top level directory of the Linux mount point. @@ -601,6 +960,12 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, } memcpy(linux_target + levels*3, smb_target+1, smb_target_len); /* +1 to skip leading sep */ } else { + /* + * This is either an absolute symlink in POSIX-style format + * or relative SMB symlink from the current directory. + * These paths have same format as Linux symlinks, so no + * conversion is needed. + */ linux_target = smb_target; smb_target = NULL; } @@ -620,8 +985,8 @@ out: return rc; } -static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym, - u32 plen, bool unicode, +static int parse_reparse_native_symlink(struct reparse_symlink_data_buffer *sym, + u32 plen, struct cifs_sb_info *cifs_sb, const char *full_path, struct cifs_open_info_data *data) @@ -641,7 +1006,6 @@ static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym, return smb2_parse_native_symlink(&data->symlink_target, sym->PathBuffer + offs, len, - unicode, le32_to_cpu(sym->Flags) & SYMLINK_FLAG_RELATIVE, full_path, cifs_sb); @@ -696,7 +1060,7 @@ static int parse_reparse_wsl_symlink(struct reparse_wsl_symlink_data_buffer *buf int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, const char *full_path, - bool unicode, struct cifs_open_info_data *data) + struct cifs_open_info_data *data) { struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); @@ -705,12 +1069,12 @@ int parse_reparse_point(struct reparse_data_buffer *buf, /* See MS-FSCC 2.1.2 */ switch (le32_to_cpu(buf->ReparseTag)) { case IO_REPARSE_TAG_NFS: - return parse_reparse_posix((struct reparse_posix_data *)buf, + return parse_reparse_nfs((struct reparse_nfs_data_buffer *)buf, cifs_sb, data); case IO_REPARSE_TAG_SYMLINK: - return parse_reparse_symlink( + return parse_reparse_native_symlink( (struct reparse_symlink_data_buffer *)buf, - plen, unicode, cifs_sb, full_path, data); + plen, cifs_sb, full_path, data); case IO_REPARSE_TAG_LX_SYMLINK: return parse_reparse_wsl_symlink( (struct reparse_wsl_symlink_data_buffer *)buf, @@ -724,13 +1088,12 @@ int parse_reparse_point(struct reparse_data_buffer *buf, le32_to_cpu(buf->ReparseTag)); return -EIO; } - break; + return 0; default: cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n", le32_to_cpu(buf->ReparseTag)); - break; + return -EOPNOTSUPP; } - return 0; } int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, @@ -744,14 +1107,15 @@ int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, buf = (struct reparse_data_buffer *)((u8 *)io + le32_to_cpu(io->OutputOffset)); - return parse_reparse_point(buf, plen, cifs_sb, full_path, true, data); + return parse_reparse_point(buf, plen, cifs_sb, full_path, data); } -static void wsl_to_fattr(struct cifs_open_info_data *data, +static bool wsl_to_fattr(struct cifs_open_info_data *data, struct cifs_sb_info *cifs_sb, u32 tag, struct cifs_fattr *fattr) { struct smb2_file_full_ea_info *ea; + bool have_xattr_dev = false; u32 next = 0; switch (tag) { @@ -794,21 +1158,31 @@ static void wsl_to_fattr(struct cifs_open_info_data *data, fattr->cf_uid = wsl_make_kuid(cifs_sb, v); else if (!strncmp(name, SMB2_WSL_XATTR_GID, nlen)) fattr->cf_gid = wsl_make_kgid(cifs_sb, v); - else if (!strncmp(name, SMB2_WSL_XATTR_MODE, nlen)) + else if (!strncmp(name, SMB2_WSL_XATTR_MODE, nlen)) { + /* File type in reparse point tag and in xattr mode must match. */ + if (S_DT(fattr->cf_mode) != S_DT(le32_to_cpu(*(__le32 *)v))) + return false; fattr->cf_mode = (umode_t)le32_to_cpu(*(__le32 *)v); - else if (!strncmp(name, SMB2_WSL_XATTR_DEV, nlen)) + } else if (!strncmp(name, SMB2_WSL_XATTR_DEV, nlen)) { fattr->cf_rdev = reparse_mkdev(v); + have_xattr_dev = true; + } } while (next); out: + + /* Major and minor numbers for char and block devices are mandatory. */ + if (!have_xattr_dev && (tag == IO_REPARSE_TAG_LX_CHR || tag == IO_REPARSE_TAG_LX_BLK)) + return false; + fattr->cf_dtype = S_DT(fattr->cf_mode); + return true; } static bool posix_reparse_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct cifs_open_info_data *data) { - struct reparse_posix_data *buf = data->reparse.posix; - + struct reparse_nfs_data_buffer *buf = (struct reparse_nfs_data_buffer *)data->reparse.buf; if (buf == NULL) return true; @@ -874,7 +1248,9 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, case IO_REPARSE_TAG_AF_UNIX: case IO_REPARSE_TAG_LX_CHR: case IO_REPARSE_TAG_LX_BLK: - wsl_to_fattr(data, cifs_sb, tag, fattr); + ok = wsl_to_fattr(data, cifs_sb, tag, fattr); + if (!ok) + return false; break; case IO_REPARSE_TAG_NFS: ok = posix_reparse_to_fattr(cifs_sb, fattr, data); diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index ff05b0e75c92..c0be5ab45a78 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -50,6 +50,7 @@ static inline kgid_t wsl_make_kgid(struct cifs_sb_info *cifs_sb, static inline u64 reparse_mode_nfs_type(mode_t mode) { switch (mode & S_IFMT) { + case S_IFLNK: return NFS_SPECFILE_LNK; case S_IFBLK: return NFS_SPECFILE_BLK; case S_IFCHR: return NFS_SPECFILE_CHR; case S_IFIFO: return NFS_SPECFILE_FIFO; @@ -61,6 +62,7 @@ static inline u64 reparse_mode_nfs_type(mode_t mode) static inline u32 reparse_mode_wsl_tag(mode_t mode) { switch (mode & S_IFMT) { + case S_IFLNK: return IO_REPARSE_TAG_LX_SYMLINK; case S_IFBLK: return IO_REPARSE_TAG_LX_BLK; case S_IFCHR: return IO_REPARSE_TAG_LX_CHR; case S_IFIFO: return IO_REPARSE_TAG_LX_FIFO; @@ -97,14 +99,30 @@ static inline bool reparse_inode_match(struct inode *inode, static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data) { - struct smb2_file_all_info *fi = &data->fi; - u32 attrs = le32_to_cpu(fi->Attributes); + u32 attrs; bool ret; - ret = data->reparse_point || (attrs & ATTR_REPARSE); - if (ret) - attrs |= ATTR_REPARSE; - fi->Attributes = cpu_to_le32(attrs); + if (data->contains_posix_file_info) { + struct smb311_posix_qinfo *fi = &data->posix_fi; + + attrs = le32_to_cpu(fi->DosAttributes); + if (data->reparse_point) { + attrs |= ATTR_REPARSE; + fi->DosAttributes = cpu_to_le32(attrs); + } + + } else { + struct smb2_file_all_info *fi = &data->fi; + + attrs = le32_to_cpu(fi->Attributes); + if (data->reparse_point) { + attrs |= ATTR_REPARSE; + fi->Attributes = cpu_to_le32(attrs); + } + } + + ret = attrs & ATTR_REPARSE; + return ret; } diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 91d4d409cb1d..faa80e7d54a6 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -1235,12 +1235,13 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) switch (requested) { case Kerberos: case RawNTLMSSP: + case IAKerb: return requested; case Unspecified: if (server->sec_ntlmssp && (global_secflags & CIFSSEC_MAY_NTLMSSP)) return RawNTLMSSP; - if ((server->sec_kerberos || server->sec_mskerberos) && + if ((server->sec_kerberos || server->sec_mskerberos || server->sec_iakerb) && (global_secflags & CIFSSEC_MAY_KRB5)) return Kerberos; fallthrough; diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 749a83cd0deb..d6e2fb669c40 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -377,7 +377,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) static void cifs_downgrade_oplock(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { cifs_set_oplock_level(cinode, oplock); } @@ -551,7 +551,7 @@ static int cifs_query_path_info(const unsigned int xid, int rc; FILE_ALL_INFO fi = {}; - data->symlink = false; + data->reparse_point = false; data->adjust_tz = false; /* could do find first instead but this returns more info */ @@ -569,32 +569,8 @@ static int cifs_query_path_info(const unsigned int xid, } if (!rc) { - int tmprc; - int oplock = 0; - struct cifs_fid fid; - struct cifs_open_parms oparms; - move_cifs_info_to_smb2(&data->fi, &fi); - - if (!(le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) - return 0; - - oparms = (struct cifs_open_parms) { - .tcon = tcon, - .cifs_sb = cifs_sb, - .desired_access = FILE_READ_ATTRIBUTES, - .create_options = cifs_create_options(cifs_sb, 0), - .disposition = FILE_OPEN, - .path = full_path, - .fid = &fid, - }; - - /* Need to check if this is a symbolic link or not */ - tmprc = CIFS_open(xid, &oparms, &oplock, NULL); - if (tmprc == -EOPNOTSUPP) - data->symlink = true; - else if (tmprc == 0) - CIFSSMBClose(xid, tcon, fid.netfid); + data->reparse_point = le32_to_cpu(fi.Attributes) & ATTR_REPARSE; } return rc; @@ -1010,7 +986,7 @@ static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb, buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol + le32_to_cpu(io->DataOffset)); - return parse_reparse_point(buf, plen, cifs_sb, full_path, true, data); + return parse_reparse_point(buf, plen, cifs_sb, full_path, data); } static bool diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index 9ec44eab8dbc..d609a20fb98a 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -63,6 +63,52 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) return sym; } +int smb2_fix_symlink_target_type(char **target, bool directory, struct cifs_sb_info *cifs_sb) +{ + char *buf; + int len; + + /* + * POSIX server does not distinguish between symlinks to file and + * symlink directory. So nothing is needed to fix on the client side. + */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) + return 0; + + if (!*target) + return -EIO; + + len = strlen(*target); + if (!len) + return -EIO; + + /* + * If this is directory symlink and it does not have trailing slash then + * append it. Trailing slash simulates Windows/SMB behavior which do not + * allow resolving directory symlink to file. + */ + if (directory && (*target)[len-1] != '/') { + buf = krealloc(*target, len+2, GFP_KERNEL); + if (!buf) + return -ENOMEM; + buf[len] = '/'; + buf[len+1] = '\0'; + *target = buf; + len++; + } + + /* + * If this is a file (non-directory) symlink and it points to path name + * with trailing slash then this is an invalid symlink because file name + * cannot contain slash character. File name with slash is invalid on + * both Windows and Linux systems. So return an error for such symlink. + */ + if (!directory && (*target)[len-1] == '/') + return -EIO; + + return 0; +} + int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, const char *full_path, char **path) { @@ -89,7 +135,6 @@ int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec return smb2_parse_native_symlink(path, (char *)sym->PathBuffer + sub_offs, sub_len, - true, le32_to_cpu(sym->Flags) & SYMLINK_FLAG_RELATIVE, full_path, cifs_sb); @@ -133,6 +178,11 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 NULL, NULL, NULL); oparms->create_options &= ~OPEN_REPARSE_POINT; } + if (!rc) { + bool directory = le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY; + rc = smb2_fix_symlink_target_type(&data->symlink_target, + directory, oparms->cifs_sb); + } } } diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index c97f14757c27..826b57a5a2a8 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -650,6 +650,7 @@ finished: switch (cmds[i]) { case SMB2_OP_QUERY_INFO: idata = in_iov[i].iov_base; + idata->contains_posix_file_info = false; if (rc == 0 && cfile && cfile->symlink_target) { idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); if (!idata->symlink_target) @@ -673,6 +674,7 @@ finished: break; case SMB2_OP_POSIX_QUERY_INFO: idata = in_iov[i].iov_base; + idata->contains_posix_file_info = true; if (rc == 0 && cfile && cfile->symlink_target) { idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); if (!idata->symlink_target) @@ -770,6 +772,7 @@ finished: idata = in_iov[i].iov_base; idata->reparse.io.iov = *iov; idata->reparse.io.buftype = resp_buftype[i + 1]; + idata->contains_posix_file_info = false; /* BB VERIFY */ rbuf = reparse_buf_ptr(iov); if (IS_ERR(rbuf)) { rc = PTR_ERR(rbuf); @@ -791,6 +794,7 @@ finished: case SMB2_OP_QUERY_WSL_EA: if (!rc) { idata = in_iov[i].iov_base; + idata->contains_posix_file_info = false; qi_rsp = rsp_iov[i + 1].iov_base; data[0] = (u8 *)qi_rsp + le16_to_cpu(qi_rsp->OutputBufferOffset); size[0] = le32_to_cpu(qi_rsp->OutputBufferLength); @@ -1010,6 +1014,11 @@ int smb2_query_path_info(const unsigned int xid, else rc = -EOPNOTSUPP; } + + if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) { + bool directory = le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY; + rc = smb2_fix_symlink_target_type(&data->symlink_target, directory, cifs_sb); + } break; case -EREMOTE: break; diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c index b05313acf9b2..12c2b868789f 100644 --- a/fs/smb/client/smb2maperror.c +++ b/fs/smb/client/smb2maperror.c @@ -380,7 +380,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_NO_LOGON_SERVERS, -EIO, "STATUS_NO_LOGON_SERVERS"}, {STATUS_NO_SUCH_LOGON_SESSION, -EIO, "STATUS_NO_SUCH_LOGON_SESSION"}, {STATUS_NO_SUCH_PRIVILEGE, -EIO, "STATUS_NO_SUCH_PRIVILEGE"}, - {STATUS_PRIVILEGE_NOT_HELD, -EIO, "STATUS_PRIVILEGE_NOT_HELD"}, + {STATUS_PRIVILEGE_NOT_HELD, -EPERM, "STATUS_PRIVILEGE_NOT_HELD"}, {STATUS_INVALID_ACCOUNT_NAME, -EIO, "STATUS_INVALID_ACCOUNT_NAME"}, {STATUS_USER_EXISTS, -EIO, "STATUS_USER_EXISTS"}, {STATUS_NO_SUCH_USER, -EIO, "STATUS_NO_SUCH_USER"}, @@ -871,7 +871,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_VALIDATE_CONTINUE, -EIO, "STATUS_VALIDATE_CONTINUE"}, {STATUS_NO_MATCH, -EIO, "STATUS_NO_MATCH"}, {STATUS_NO_MORE_MATCHES, -EIO, "STATUS_NO_MORE_MATCHES"}, - {STATUS_NOT_A_REPARSE_POINT, -EIO, "STATUS_NOT_A_REPARSE_POINT"}, + {STATUS_NOT_A_REPARSE_POINT, -ENODATA, "STATUS_NOT_A_REPARSE_POINT"}, {STATUS_IO_REPARSE_TAG_INVALID, -EIO, "STATUS_IO_REPARSE_TAG_INVALID"}, {STATUS_IO_REPARSE_TAG_MISMATCH, -EIO, "STATUS_IO_REPARSE_TAG_MISMATCH"}, diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index d640dcabc305..23e0c8be7fb5 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1001,6 +1001,7 @@ static int smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, if (!data->symlink_target) return -ENOMEM; } + data->contains_posix_file_info = false; return SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, &data->fi); } @@ -3904,22 +3905,22 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, static void smb2_downgrade_oplock(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { server->ops->set_oplock_level(cinode, oplock, 0, NULL); } static void smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache); + __u16 epoch, bool *purge_cache); static void smb3_downgrade_oplock(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { unsigned int old_state = cinode->oplock; - unsigned int old_epoch = cinode->epoch; + __u16 old_epoch = cinode->epoch; unsigned int new_state; if (epoch > old_epoch) { @@ -3939,7 +3940,7 @@ smb3_downgrade_oplock(struct TCP_Server_Info *server, static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { oplock &= 0xFF; cinode->lease_granted = false; @@ -3963,7 +3964,7 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, static void smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { char message[5] = {0}; unsigned int new_oplock = 0; @@ -4000,7 +4001,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, static void smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { unsigned int old_oplock = cinode->oplock; @@ -4114,7 +4115,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock) } static __u8 -smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) +smb2_parse_lease_buf(void *buf, __u16 *epoch, char *lease_key) { struct create_lease *lc = (struct create_lease *)buf; @@ -4125,7 +4126,7 @@ smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) } static __u8 -smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) +smb3_parse_lease_buf(void *buf, __u16 *epoch, char *lease_key) { struct create_lease_v2 *lc = (struct create_lease_v2 *)buf; @@ -5077,6 +5078,7 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, { struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; + struct cifs_open_info_data idata; struct cifs_io_parms io_parms = {}; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_fid fid; @@ -5145,11 +5147,21 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, FILE_CREATE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, ACL_NO_MODE); oparms.fid = &fid; - - rc = server->ops->open(xid, &oparms, &oplock, NULL); + idata.contains_posix_file_info = false; + rc = server->ops->open(xid, &oparms, &oplock, &idata); if (rc) goto out; + /* + * Check if the server honored ATTR_SYSTEM flag by CREATE_OPTION_SPECIAL + * option. If not then server does not support ATTR_SYSTEM and newly + * created file is not SFU compatible, which means that the call failed. + */ + if (!(le32_to_cpu(idata.fi.Attributes) & ATTR_SYSTEM)) { + rc = -EOPNOTSUPP; + goto out_close; + } + if (type_len + data_len > 0) { io_parms.pid = current->tgid; io_parms.tcon = tcon; @@ -5164,8 +5176,18 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, iov, ARRAY_SIZE(iov)-1); } +out_close: server->ops->close(xid, tcon, &fid); + /* + * If CREATE was successful but either setting ATTR_SYSTEM failed or + * writing type/data information failed then remove the intermediate + * object created by CREATE. Otherwise intermediate empty object stay + * on the server. + */ + if (rc) + server->ops->unlink(xid, tcon, full_path, cifs_sb, NULL); + out: kfree(symname_utf16); return rc; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 9f54596a6866..ed7812247ebc 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1429,7 +1429,7 @@ smb2_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) if (server->sec_ntlmssp && (global_secflags & CIFSSEC_MAY_NTLMSSP)) return RawNTLMSSP; - if ((server->sec_kerberos || server->sec_mskerberos) && + if ((server->sec_kerberos || server->sec_mskerberos || server->sec_iakerb) && (global_secflags & CIFSSEC_MAY_KRB5)) return Kerberos; fallthrough; @@ -2169,7 +2169,7 @@ tcon_exit: tcon_error_exit: if (rsp && rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) - cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); + cifs_dbg(VFS | ONCE, "BAD_NETWORK_NAME: %s\n", tree); goto tcon_exit; } @@ -2329,7 +2329,7 @@ parse_posix_ctxt(struct create_context *cc, struct smb2_file_all_info *info, int smb2_parse_contexts(struct TCP_Server_Info *server, struct kvec *rsp_iov, - unsigned int *epoch, + __u16 *epoch, char *lease_key, __u8 *oplock, struct smb2_file_all_info *buf, struct create_posix_rsp *posix) diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 09349fa8da03..4662c7e2d259 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -111,8 +111,9 @@ extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_read); +int smb2_fix_symlink_target_type(char **target, bool directory, struct cifs_sb_info *cifs_sb); int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, - bool unicode, bool relative, + bool relative, const char *full_path, struct cifs_sb_info *cifs_sb); int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, @@ -282,7 +283,7 @@ extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *, enum securityEnum); int smb2_parse_contexts(struct TCP_Server_Info *server, struct kvec *rsp_iov, - unsigned int *epoch, + __u16 *epoch, char *lease_key, __u8 *oplock, struct smb2_file_all_info *buf, struct create_posix_rsp *posix); diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index 3c7c706c797d..c7a0efda4403 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1550,7 +1550,19 @@ struct reparse_symlink_data_buffer { __u8 PathBuffer[]; /* Variable Length */ } __packed; -/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */ +/* For IO_REPARSE_TAG_NFS - see MS-FSCC 2.1.2.6 */ +#define NFS_SPECFILE_LNK 0x00000000014B4E4C +#define NFS_SPECFILE_CHR 0x0000000000524843 +#define NFS_SPECFILE_BLK 0x00000000004B4C42 +#define NFS_SPECFILE_FIFO 0x000000004F464946 +#define NFS_SPECFILE_SOCK 0x000000004B434F53 +struct reparse_nfs_data_buffer { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __le64 InodeType; /* NFS_SPECFILE_* */ + __u8 DataBuffer[]; +} __packed; /* For IO_REPARSE_TAG_LX_SYMLINK */ struct reparse_wsl_symlink_data_buffer { @@ -1695,23 +1707,33 @@ struct smb2_file_internal_info { } __packed; /* level 6 Query */ struct smb2_file_rename_info { /* encoding of request for level 10 */ - __u8 ReplaceIfExists; /* 1 = replace existing target with new */ - /* 0 = fail if target already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ - __le32 FileNameLength; + /* New members MUST be added within the struct_group() macro below. */ + __struct_group(smb2_file_rename_info_hdr, __hdr, __packed, + __u8 ReplaceIfExists; /* 1 = replace existing target with new */ + /* 0 = fail if target already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + ); char FileName[]; /* New name to be assigned */ /* padding - overall struct size must be >= 24 so filename + pad >= 6 */ } __packed; /* level 10 Set */ +static_assert(offsetof(struct smb2_file_rename_info, FileName) == sizeof(struct smb2_file_rename_info_hdr), + "struct member likely outside of __struct_group()"); struct smb2_file_link_info { /* encoding of request for level 11 */ - __u8 ReplaceIfExists; /* 1 = replace existing link with new */ - /* 0 = fail if link already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ - __le32 FileNameLength; + /* New members MUST be added within the struct_group() macro below. */ + __struct_group(smb2_file_link_info_hdr, __hdr, __packed, + __u8 ReplaceIfExists; /* 1 = replace existing link with new */ + /* 0 = fail if link already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + ); char FileName[]; /* Name to be assigned to new link */ } __packed; /* level 11 Set */ +static_assert(offsetof(struct smb2_file_link_info, FileName) == sizeof(struct smb2_file_link_info_hdr), + "struct member likely outside of __struct_group()"); /* * This level 18, although with struct with same name is different from cifs diff --git a/fs/smb/common/smbfsctl.h b/fs/smb/common/smbfsctl.h index 4b379e84c46b..3253a18ecb5c 100644 --- a/fs/smb/common/smbfsctl.h +++ b/fs/smb/common/smbfsctl.h @@ -159,6 +159,9 @@ #define IO_REPARSE_TAG_LX_CHR 0x80000025 #define IO_REPARSE_TAG_LX_BLK 0x80000026 +/* If Name Surrogate Bit is set, the file or directory represents another named entity in the system. */ +#define IS_REPARSE_TAG_NAME_SURROGATE(tag) (!!((tag) & 0x20000000)) + /* fsctl flags */ /* If Flags is set to this value, the request is an FSCTL not ioctl request */ #define SMB2_0_IOCTL_IS_FSCTL 0x00000001 diff --git a/fs/stat.c b/fs/stat.c index 2c0e111a098a..f13308bfdc98 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -281,6 +281,8 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, u32 request_mask) { int error = vfs_getattr(path, stat, request_mask, flags); + if (error) + return error; if (request_mask & STATX_MNT_ID_UNIQUE) { stat->mnt_id = real_mount(path->mnt)->mnt_id_unique; @@ -302,7 +304,7 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, if (S_ISBLK(stat->mode)) bdev_statx(path, stat, request_mask); - return error; + return 0; } static int vfs_statx_fd(int fd, int flags, struct kstat *stat, diff --git a/fs/sysctls.c b/fs/sysctls.c index 8dbde9a802fa..ad429dffeb4b 100644 --- a/fs/sysctls.c +++ b/fs/sysctls.c @@ -7,7 +7,7 @@ #include <linux/init.h> #include <linux/sysctl.h> -static struct ctl_table fs_shared_sysctls[] = { +static const struct ctl_table fs_shared_sysctls[] = { { .procname = "overflowuid", .data = &fs_overflowuid, diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index cfc614c638da..53214499e384 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -457,7 +457,8 @@ static void tracefs_d_release(struct dentry *dentry) eventfs_d_release(dentry); } -static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags) +static int tracefs_d_revalidate(struct inode *inode, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct eventfs_inode *ei = dentry->d_fsdata; diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 5cc69beaa62e..b01f382ce8db 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -863,7 +863,6 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) out: vfree(buf); - return; } void ubifs_dump_znode(const struct ubifs_info *c, @@ -946,16 +945,20 @@ void ubifs_dump_tnc(struct ubifs_info *c) pr_err("\n"); pr_err("(pid %d) start dumping TNC tree\n", current->pid); - znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, NULL); - level = znode->level; - pr_err("== Level %d ==\n", level); - while (znode) { - if (level != znode->level) { - level = znode->level; - pr_err("== Level %d ==\n", level); + if (c->zroot.znode) { + znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, NULL); + level = znode->level; + pr_err("== Level %d ==\n", level); + while (znode) { + if (level != znode->level) { + level = znode->level; + pr_err("== Level %d ==\n", level); + } + ubifs_dump_znode(c, znode); + znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, znode); } - ubifs_dump_znode(c, znode); - znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, znode); + } else { + pr_err("empty TNC tree in memory\n"); } pr_err("(pid %d) finish dumping TNC tree\n", current->pid); } diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index aa8837e6247c..f2cb214581fd 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1932,7 +1932,6 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum); out: vfree(buf); - return; } /** diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7c0bd0b55f88..97c4d71115d8 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -36,7 +36,7 @@ static int sysctl_unprivileged_userfaultfd __read_mostly; #ifdef CONFIG_SYSCTL -static struct ctl_table vm_userfaultfd_table[] = { +static const struct ctl_table vm_userfaultfd_table[] = { { .procname = "unprivileged_userfaultfd", .data = &sysctl_unprivileged_userfaultfd, diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index 5f1a14d5b927..a859ac9b74ba 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -192,7 +192,8 @@ const struct file_operations vboxsf_dir_fops = { * This is called during name resolution/lookup to check if the @dentry in * the cache is still valid. the job is handled by vboxsf_inode_revalidate. */ -static int vboxsf_dentry_revalidate(struct dentry *dentry, unsigned int flags) +static int vboxsf_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index e95b8a48d8a0..1d94bb784108 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -21,7 +21,8 @@ #define VBOXSF_SUPER_MAGIC 0x786f4256 /* 'VBox' little endian */ -static const unsigned char VBSF_MOUNT_SIGNATURE[4] = "\000\377\376\375"; +static const unsigned char VBSF_MOUNT_SIGNATURE[4] = { '\000', '\377', '\376', + '\375' }; static int follow_symlinks; module_param(follow_symlinks, int, 0444); diff --git a/fs/verity/init.c b/fs/verity/init.c index f440f0e61e3e..6e8d33b50240 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -10,7 +10,7 @@ #include <linux/ratelimit.h> #ifdef CONFIG_SYSCTL -static struct ctl_table fsverity_sysctl_table[] = { +static const struct ctl_table fsverity_sysctl_table[] = { #ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES { .procname = "require_signatures", diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 40ad22fb808b..0ef19f1469ec 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3563,12 +3563,12 @@ xfs_bmap_btalloc_at_eof( int error; /* - * If there are already extents in the file, try an exact EOF block - * allocation to extend the file as a contiguous extent. If that fails, - * or it's the first allocation in a file, just try for a stripe aligned - * allocation. + * If there are already extents in the file, and xfs_bmap_adjacent() has + * given a better blkno, try an exact EOF block allocation to extend the + * file as a contiguous extent. If that fails, or it's the first + * allocation in a file, just try for a stripe aligned allocation. */ - if (ap->offset) { + if (ap->eof) { xfs_extlen_t nextminlen = 0; /* @@ -3736,7 +3736,8 @@ xfs_bmap_btalloc_best_length( int error; ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino); - xfs_bmap_adjacent(ap); + if (!xfs_bmap_adjacent(ap)) + ap->eof = false; /* * Search for an allocation group with a single extent large enough for diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index bdcd40f0ec74..19877d99f255 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -224,7 +224,6 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) bool xchk_dir_looks_zapped(struct xfs_inode *dp); bool xchk_pptr_looks_zapped(struct xfs_inode *ip); -#ifdef CONFIG_XFS_ONLINE_REPAIR /* Decide if a repair is required. */ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) { @@ -244,10 +243,6 @@ static inline bool xchk_could_repair(const struct xfs_scrub *sc) return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !(sc->flags & XREP_ALREADY_FIXED); } -#else -# define xchk_needs_repair(sc) (false) -# define xchk_could_repair(sc) (false) -#endif /* CONFIG_XFS_ONLINE_REPAIR */ int xchk_metadata_inode_forks(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 2f641b6d663e..13ff1c933cb8 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -1055,9 +1055,17 @@ xrep_dinode_check_dfork( return true; break; case S_IFREG: - if (fmt == XFS_DINODE_FMT_LOCAL) + switch (fmt) { + case XFS_DINODE_FMT_LOCAL: return true; - fallthrough; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_META_BTREE: + break; + default: + return true; + } + break; case S_IFLNK: case S_IFDIR: switch (fmt) { diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 823c00d1a502..af0a3a9e5ed9 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -191,7 +191,16 @@ int xrep_reset_metafile_resv(struct xfs_scrub *sc); #else #define xrep_ino_dqattach(sc) (0) -#define xrep_will_attempt(sc) (false) + +/* + * When online repair is not built into the kernel, we still want to attempt + * the repair so that the stub xrep_attempt below will return EOPNOTSUPP. + */ +static inline bool xrep_will_attempt(const struct xfs_scrub *sc) +{ + return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + xchk_needs_repair(sc->sm); +} static inline int xrep_attempt( diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 7567dd5cad14..6fa9e3e5bab7 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -149,6 +149,18 @@ xchk_probe( if (xchk_should_terminate(sc, &error)) return error; + /* + * If the caller is probing to see if repair works but repair isn't + * built into the kernel, return EOPNOTSUPP because that's the signal + * that userspace expects. If online repair is built in, set the + * CORRUPT flag (without any of the usual tracing/logging) to force us + * into xrep_probe. + */ + if (xchk_could_repair(sc)) { + if (!IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)) + return -EOPNOTSUPP; + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + } return 0; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 67877c36ed11..6d9965b546cb 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -19,6 +19,7 @@ #include "xfs_reflink.h" #include "xfs_errortag.h" #include "xfs_error.h" +#include "xfs_icache.h" struct xfs_writepage_ctx { struct iomap_writepage_ctx ctx; @@ -528,12 +529,44 @@ xfs_vm_readahead( } static int -xfs_iomap_swapfile_activate( +xfs_vm_swap_activate( struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { - sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev; + struct xfs_inode *ip = XFS_I(file_inode(swap_file)); + + /* + * Swap file activation can race against concurrent shared extent + * removal in files that have been cloned. If this happens, + * iomap_swapfile_iter() can fail because it encountered a shared + * extent even though an operation is in progress to remove those + * shared extents. + * + * This race becomes problematic when we defer extent removal + * operations beyond the end of a syscall (i.e. use async background + * processing algorithms). Users think the extents are no longer + * shared, but iomap_swapfile_iter() still sees them as shared + * because the refcountbt entries for the extents being removed have + * not yet been updated. Hence the swapon call fails unexpectedly. + * + * The race condition is currently most obvious from the unlink() + * operation as extent removal is deferred until after the last + * reference to the inode goes away. We then process the extent + * removal asynchronously, hence triggers the "syscall completed but + * work not done" condition mentioned above. To close this race + * window, we need to flush any pending inodegc operations to ensure + * they have updated the refcountbt records before we try to map the + * swapfile. + */ + xfs_inodegc_flush(ip->i_mount); + + /* + * Direct the swap code to the correct block device when this file + * sits on the RT device. + */ + sis->bdev = xfs_inode_buftarg(ip)->bt_bdev; + return iomap_swapfile_activate(sis, swap_file, span, &xfs_read_iomap_ops); } @@ -549,11 +582,11 @@ const struct address_space_operations xfs_address_space_operations = { .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = xfs_iomap_swapfile_activate, + .swap_activate = xfs_vm_swap_activate, }; const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, .dirty_folio = noop_dirty_folio, - .swap_activate = xfs_iomap_swapfile_activate, + .swap_activate = xfs_vm_swap_activate, }; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d1d4a0a22e13..15bb790359f8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -41,8 +41,7 @@ struct kmem_cache *xfs_buf_cache; * * xfs_buf_rele: * b_lock - * pag_buf_lock - * lru_lock + * lru_lock * * xfs_buftarg_drain_rele * lru_lock @@ -220,23 +219,25 @@ _xfs_buf_alloc( */ flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); - spin_lock_init(&bp->b_lock); + /* + * A new buffer is held and locked by the owner. This ensures that the + * buffer is owned by the caller and racing RCU lookups right after + * inserting into the hash table are safe (and will have to wait for + * the unlock to do anything non-trivial). + */ bp->b_hold = 1; + sema_init(&bp->b_sema, 0); /* held, no waiters */ + + spin_lock_init(&bp->b_lock); atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); INIT_LIST_HEAD(&bp->b_li_list); - sema_init(&bp->b_sema, 0); /* held, no waiters */ bp->b_target = target; bp->b_mount = target->bt_mount; bp->b_flags = flags; - /* - * Set length and io_length to the same value initially. - * I/O routines should use io_length, which will be the same in - * most cases but may be reset (e.g. XFS recovery). - */ error = xfs_buf_get_maps(bp, nmaps); if (error) { kmem_cache_free(xfs_buf_cache, bp); @@ -502,7 +503,6 @@ int xfs_buf_cache_init( struct xfs_buf_cache *bch) { - spin_lock_init(&bch->bc_lock); return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params); } @@ -652,17 +652,20 @@ xfs_buf_find_insert( if (error) goto out_free_buf; - spin_lock(&bch->bc_lock); + /* The new buffer keeps the perag reference until it is freed. */ + new_bp->b_pag = pag; + + rcu_read_lock(); bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, &new_bp->b_rhash_head, xfs_buf_hash_params); if (IS_ERR(bp)) { + rcu_read_unlock(); error = PTR_ERR(bp); - spin_unlock(&bch->bc_lock); goto out_free_buf; } if (bp && xfs_buf_try_hold(bp)) { /* found an existing buffer */ - spin_unlock(&bch->bc_lock); + rcu_read_unlock(); error = xfs_buf_find_lock(bp, flags); if (error) xfs_buf_rele(bp); @@ -670,10 +673,8 @@ xfs_buf_find_insert( *bpp = bp; goto out_free_buf; } + rcu_read_unlock(); - /* The new buffer keeps the perag reference until it is freed. */ - new_bp->b_pag = pag; - spin_unlock(&bch->bc_lock); *bpp = new_bp; return 0; @@ -1090,7 +1091,6 @@ xfs_buf_rele_cached( } /* we are asked to drop the last reference */ - spin_lock(&bch->bc_lock); __xfs_buf_ioacct_dec(bp); if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { /* @@ -1102,7 +1102,6 @@ xfs_buf_rele_cached( bp->b_state &= ~XFS_BSTATE_DISPOSE; else bp->b_hold--; - spin_unlock(&bch->bc_lock); } else { bp->b_hold--; /* @@ -1120,7 +1119,6 @@ xfs_buf_rele_cached( ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head, xfs_buf_hash_params); - spin_unlock(&bch->bc_lock); if (pag) xfs_perag_put(pag); freebuf = true; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 7e73663c5d4a..3b4ed42e11c0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -80,7 +80,6 @@ typedef unsigned int xfs_buf_flags_t; #define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ struct xfs_buf_cache { - spinlock_t bc_lock; struct rhashtable bc_hash; }; diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c index f340a2015c4c..0b41bdfecdfb 100644 --- a/fs/xfs/xfs_exchrange.c +++ b/fs/xfs/xfs_exchrange.c @@ -329,22 +329,6 @@ out_trans_cancel: * successfully but before locks are dropped. */ -/* Verify that we have security clearance to perform this operation. */ -static int -xfs_exchange_range_verify_area( - struct xfs_exchrange *fxr) -{ - int ret; - - ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length, - true); - if (ret) - return ret; - - return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length, - true); -} - /* * Performs necessary checks before doing a range exchange, having stabilized * mutable inode attributes via i_rwsem. @@ -355,11 +339,13 @@ xfs_exchange_range_checks( unsigned int alloc_unit) { struct inode *inode1 = file_inode(fxr->file1); + loff_t size1 = i_size_read(inode1); struct inode *inode2 = file_inode(fxr->file2); + loff_t size2 = i_size_read(inode2); uint64_t allocmask = alloc_unit - 1; int64_t test_len; uint64_t blen; - loff_t size1, size2, tmp; + loff_t tmp; int error; /* Don't touch certain kinds of inodes */ @@ -368,24 +354,25 @@ xfs_exchange_range_checks( if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2)) return -ETXTBSY; - size1 = i_size_read(inode1); - size2 = i_size_read(inode2); - /* Ranges cannot start after EOF. */ if (fxr->file1_offset > size1 || fxr->file2_offset > size2) return -EINVAL; - /* - * If the caller said to exchange to EOF, we set the length of the - * request large enough to cover everything to the end of both files. - */ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) { + /* + * If the caller said to exchange to EOF, we set the length of + * the request large enough to cover everything to the end of + * both files. + */ fxr->length = max_t(int64_t, size1 - fxr->file1_offset, size2 - fxr->file2_offset); - - error = xfs_exchange_range_verify_area(fxr); - if (error) - return error; + } else { + /* + * Otherwise we require both ranges to end within EOF. + */ + if (fxr->file1_offset + fxr->length > size1 || + fxr->file2_offset + fxr->length > size2) + return -EINVAL; } /* @@ -402,15 +389,6 @@ xfs_exchange_range_checks( return -EINVAL; /* - * We require both ranges to end within EOF, unless we're exchanging - * to EOF. - */ - if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) && - (fxr->file1_offset + fxr->length > size1 || - fxr->file2_offset + fxr->length > size2)) - return -EINVAL; - - /* * Make sure we don't hit any file size limits. If we hit any size * limits such that test_length was adjusted, we abort the whole * operation. @@ -747,6 +725,7 @@ xfs_exchange_range( { struct inode *inode1 = file_inode(fxr->file1); struct inode *inode2 = file_inode(fxr->file2); + loff_t check_len = fxr->length; int ret; BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS & @@ -779,14 +758,18 @@ xfs_exchange_range( return -EBADF; /* - * If we're not exchanging to EOF, we can check the areas before - * stabilizing both files' i_size. + * If we're exchanging to EOF we can't calculate the length until taking + * the iolock. Pass a 0 length to remap_verify_area similar to the + * FICLONE and FICLONERANGE ioctls that support cloning to EOF as well. */ - if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) { - ret = xfs_exchange_range_verify_area(fxr); - if (ret) - return ret; - } + if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) + check_len = 0; + ret = remap_verify_area(fxr->file1, fxr->file1_offset, check_len, true); + if (ret) + return ret; + ret = remap_verify_area(fxr->file2, fxr->file2_offset, check_len, true); + if (ret) + return ret; /* Update cmtime if the fd/inode don't forbid it. */ if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c95fe1b1de4e..b1f9f156ec88 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1404,8 +1404,11 @@ xfs_inactive( goto out; /* Try to clean out the cow blocks if there are any. */ - if (xfs_inode_has_cow_data(ip)) - xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); + if (xfs_inode_has_cow_data(ip)) { + error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); + if (error) + goto out; + } if (VFS_I(ip)->i_nlink != 0) { /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 50fa3ef89f6c..d61460309a78 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -976,10 +976,8 @@ xfs_dax_write_iomap_end( if (!xfs_is_cow_inode(ip)) return 0; - if (!written) { - xfs_reflink_cancel_cow_range(ip, pos, length, true); - return 0; - } + if (!written) + return xfs_reflink_cancel_cow_range(ip, pos, length, true); return xfs_reflink_end_cow(ip, pos, written); } diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 37f1230e7584..245d754f382a 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -78,6 +78,28 @@ xfs_qm_statvfs( } } +STATIC int +xfs_qm_validate_state_change( + struct xfs_mount *mp, + uint uqd, + uint gqd, + uint pqd) +{ + int state; + + /* Is quota state changing? */ + state = ((uqd && !XFS_IS_UQUOTA_ON(mp)) || + (!uqd && XFS_IS_UQUOTA_ON(mp)) || + (gqd && !XFS_IS_GQUOTA_ON(mp)) || + (!gqd && XFS_IS_GQUOTA_ON(mp)) || + (pqd && !XFS_IS_PQUOTA_ON(mp)) || + (!pqd && XFS_IS_PQUOTA_ON(mp))); + + return state && + (xfs_dev_is_read_only(mp, "changing quota state") || + xfs_has_norecovery(mp)); +} + int xfs_qm_newmount( xfs_mount_t *mp, @@ -97,24 +119,25 @@ xfs_qm_newmount( } /* - * If the device itself is read-only, we can't allow - * the user to change the state of quota on the mount - - * this would generate a transaction on the ro device, - * which would lead to an I/O error and shutdown + * If the device itself is read-only and/or in norecovery + * mode, we can't allow the user to change the state of + * quota on the mount - this would generate a transaction + * on the ro device, which would lead to an I/O error and + * shutdown. */ - if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || - (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || - (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || - (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) || - (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) || - (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) && - xfs_dev_is_read_only(mp, "changing quota state")) { - xfs_warn(mp, "please mount with%s%s%s%s.", - (!quotaondisk ? "out quota" : ""), - (uquotaondisk ? " usrquota" : ""), - (gquotaondisk ? " grpquota" : ""), - (pquotaondisk ? " prjquota" : "")); + if (xfs_qm_validate_state_change(mp, uquotaondisk, + gquotaondisk, pquotaondisk)) { + + if (xfs_has_metadir(mp)) + xfs_warn(mp, + "metadir enabled, please mount without any quota mount options"); + else + xfs_warn(mp, "please mount with%s%s%s%s.", + (!quotaondisk ? "out quota" : ""), + (uquotaondisk ? " usrquota" : ""), + (gquotaondisk ? " grpquota" : ""), + (pquotaondisk ? " prjquota" : "")); return -EPERM; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d92d7a07ea89..0055066fb1d9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1661,8 +1661,12 @@ xfs_fs_fill_super( #endif } - /* Filesystem claims it needs repair, so refuse the mount. */ - if (xfs_has_needsrepair(mp)) { + /* + * Filesystem claims it needs repair, so refuse the mount unless + * norecovery is also specified, in which case the filesystem can + * be mounted with no risk of further damage. + */ + if (xfs_has_needsrepair(mp) && !xfs_has_norecovery(mp)) { xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair."); error = -EFSCORRUPTED; goto out_free_sb; diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index c84df23b494d..751dc74a3067 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -66,7 +66,7 @@ xfs_deprecated_dointvec_minmax( return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); } -static struct ctl_table xfs_table[] = { +static const struct ctl_table xfs_table[] = { { .procname = "irix_sgid_inherit", .data = &xfs_params.sgid_inherit.val, |