diff options
Diffstat (limited to 'fs')
369 files changed, 9868 insertions, 6546 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index de009a33e0e2..f84412290a30 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -131,10 +131,9 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) } } spin_unlock(&dentry->d_lock); - } else { - if (dentry->d_inode) - ret = v9fs_fid_find_inode(dentry->d_inode, false, uid, any); } + if (!ret && dentry->d_inode) + ret = v9fs_fid_find_inode(dentry->d_inode, false, uid, any); return ret; } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 1775fcc7f0e8..698c43dd5dc8 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -179,14 +179,16 @@ extern int v9fs_vfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); -extern struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid, - bool new); +extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb, int new); extern const struct inode_operations v9fs_dir_inode_operations_dotl; extern const struct inode_operations v9fs_file_inode_operations_dotl; extern const struct inode_operations v9fs_symlink_inode_operations_dotl; extern const struct netfs_request_ops v9fs_req_ops; -extern struct inode *v9fs_fid_iget_dotl(struct super_block *sb, - struct p9_fid *fid, bool new); +extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb, int new); /* other default globals */ #define V9FS_PORT 564 @@ -225,12 +227,30 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) */ static inline struct inode * v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb, bool new) + struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) - return v9fs_fid_iget_dotl(sb, fid, new); + return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0); else - return v9fs_fid_iget(sb, fid, new); + return 
v9fs_inode_from_fid(v9ses, fid, sb, 0); +} + +/** + * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by + * issuing an attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) + return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1); + else + return v9fs_inode_from_fid(v9ses, fid, sb, 1); } #endif diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 7923c3c347cb..d3aefbec4de6 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -42,7 +42,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_free_inode(struct inode *inode); void v9fs_set_netfs_context(struct inode *inode); int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev); + struct inode *inode, umode_t mode, dev_t rdev); void v9fs_evict_inode(struct inode *inode); #if (BITS_PER_LONG == 32) #define QID2INO(q) ((ino_t) (((q)->path+2) ^ (((q)->path) >> 32))) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index fd72fc38c8f5..3e68521f4e2f 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -256,12 +256,9 @@ void v9fs_set_netfs_context(struct inode *inode) } int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev) + struct inode *inode, umode_t mode, dev_t rdev) { int err = 0; - struct v9fs_inode *v9inode = V9FS_I(inode); - - memcpy(&v9inode->qid, qid, sizeof(struct p9_qid)); inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_blocks = 0; @@ -365,59 +362,105 @@ void v9fs_evict_inode(struct inode *inode) clear_inode(inode); } -struct inode * -v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid, bool new) +static int v9fs_test_inode(struct inode *inode, void *data) +{ + 
int umode; + dev_t rdev; + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_wstat *st = (struct p9_wstat *)data; + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); + + umode = p9mode2unixmode(v9ses, st, &rdev); + /* don't match inode of different type */ + if (inode_wrong_type(inode, umode)) + return 0; + + /* compare qid details */ + if (memcmp(&v9inode->qid.version, + &st->qid.version, sizeof(v9inode->qid.version))) + return 0; + + if (v9inode->qid.type != st->qid.type) + return 0; + + if (v9inode->qid.path != st->qid.path) + return 0; + return 1; +} + +static int v9fs_test_new_inode(struct inode *inode, void *data) +{ + return 0; +} + +static int v9fs_set_inode(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_wstat *st = (struct p9_wstat *)data; + + memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); + return 0; +} + +static struct inode *v9fs_qid_iget(struct super_block *sb, + struct p9_qid *qid, + struct p9_wstat *st, + int new) { dev_t rdev; int retval; umode_t umode; struct inode *inode; - struct p9_wstat *st; struct v9fs_session_info *v9ses = sb->s_fs_info; + int (*test)(struct inode *inode, void *data); - inode = iget_locked(sb, QID2INO(&fid->qid)); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { - if (!new) { - goto done; - } else { - p9_debug(P9_DEBUG_VFS, "WARNING: Inode collision %ld\n", - inode->i_ino); - iput(inode); - remove_inode_hash(inode); - inode = iget_locked(sb, QID2INO(&fid->qid)); - WARN_ON(!(inode->i_state & I_NEW)); - } - } + if (new) + test = v9fs_test_new_inode; + else + test = v9fs_test_inode; + inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode, st); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; /* * initialize the inode with the stat info * FIXME!! we may need support for stale inodes * later. 
*/ - st = p9_client_stat(fid); - if (IS_ERR(st)) { - retval = PTR_ERR(st); - goto error; - } - + inode->i_ino = QID2INO(qid); umode = p9mode2unixmode(v9ses, st, &rdev); - retval = v9fs_init_inode(v9ses, inode, &fid->qid, umode, rdev); - v9fs_stat2inode(st, inode, sb, 0); - p9stat_free(st); - kfree(st); + retval = v9fs_init_inode(v9ses, inode, umode, rdev); if (retval) goto error; + v9fs_stat2inode(st, inode, sb, 0); v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); unlock_new_inode(inode); -done: return inode; error: iget_failed(inode); return ERR_PTR(retval); + +} + +struct inode * +v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb, int new) +{ + struct p9_wstat *st; + struct inode *inode = NULL; + + st = p9_client_stat(fid); + if (IS_ERR(st)) + return ERR_CAST(st); + + inode = v9fs_qid_iget(sb, &st->qid, st, new); + p9stat_free(st); + kfree(st); + return inode; } /** @@ -449,15 +492,8 @@ static int v9fs_at_to_dotl_flags(int flags) */ static void v9fs_dec_count(struct inode *inode) { - if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) { - if (inode->i_nlink) { - drop_nlink(inode); - } else { - p9_debug(P9_DEBUG_VFS, - "WARNING: unexpected i_nlink zero %d inode %ld\n", - inode->i_nlink, inode->i_ino); - } - } + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); } /** @@ -508,9 +544,6 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) } else v9fs_dec_count(inode); - if (inode->i_nlink <= 0) /* no more refs unhash it */ - remove_inode_hash(inode); - v9fs_invalidate_inode_attr(inode); v9fs_invalidate_inode_attr(dir); @@ -576,7 +609,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, /* * instantiate inode and assign the unopened fid to the dentry */ - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb, true); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, 
@@ -704,8 +737,10 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, inode = NULL; else if (IS_ERR(fid)) inode = ERR_CAST(fid); + else if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); else - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb, false); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); /* * If we had a rename on the server and a parallel lookup * for the new name, then make sure we instantiate with diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index c61b97bd13b9..143ac03b7425 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -52,50 +52,80 @@ static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) return current_fsgid(); } +static int v9fs_test_inode_dotl(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; + /* don't match inode of different type */ + if (inode_wrong_type(inode, st->st_mode)) + return 0; -struct inode * -v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid, bool new) + if (inode->i_generation != st->st_gen) + return 0; + + /* compare qid details */ + if (memcmp(&v9inode->qid.version, + &st->qid.version, sizeof(v9inode->qid.version))) + return 0; + + if (v9inode->qid.type != st->qid.type) + return 0; + + if (v9inode->qid.path != st->qid.path) + return 0; + return 1; +} + +/* Always get a new inode */ +static int v9fs_test_new_inode_dotl(struct inode *inode, void *data) +{ + return 0; +} + +static int v9fs_set_inode_dotl(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; + + memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); + inode->i_generation = st->st_gen; + return 0; +} + +static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, + struct p9_qid *qid, + struct p9_fid *fid, + struct p9_stat_dotl *st, + int new) { 
int retval; struct inode *inode; - struct p9_stat_dotl *st; struct v9fs_session_info *v9ses = sb->s_fs_info; + int (*test)(struct inode *inode, void *data); - inode = iget_locked(sb, QID2INO(&fid->qid)); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { - if (!new) { - goto done; - } else { /* deal with race condition in inode number reuse */ - p9_debug(P9_DEBUG_ERROR, "WARNING: Inode collision %lx\n", - inode->i_ino); - iput(inode); - remove_inode_hash(inode); - inode = iget_locked(sb, QID2INO(&fid->qid)); - WARN_ON(!(inode->i_state & I_NEW)); - } - } + if (new) + test = v9fs_test_new_inode_dotl; + else + test = v9fs_test_inode_dotl; + inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode_dotl, st); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; /* * initialize the inode with the stat info * FIXME!! we may need support for stale inodes * later. */ - st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN); - if (IS_ERR(st)) { - retval = PTR_ERR(st); - goto error; - } - - retval = v9fs_init_inode(v9ses, inode, &fid->qid, + inode->i_ino = QID2INO(qid); + retval = v9fs_init_inode(v9ses, inode, st->st_mode, new_decode_dev(st->st_rdev)); - v9fs_stat2inode_dotl(st, inode, 0); - kfree(st); if (retval) goto error; + v9fs_stat2inode_dotl(st, inode, 0); v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); @@ -103,11 +133,27 @@ v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid, bool new) goto error; unlock_new_inode(inode); -done: return inode; error: iget_failed(inode); return ERR_PTR(retval); + +} + +struct inode * +v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb, int new) +{ + struct p9_stat_dotl *st; + struct inode *inode = NULL; + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN); + if (IS_ERR(st)) + return ERR_CAST(st); + + inode = 
v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new); + kfree(st); + return inode; } struct dotl_openflag_map { @@ -259,7 +305,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); goto out; } - inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); @@ -309,6 +355,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, umode_t omode) { int err; + struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; kgid_t gid; const unsigned char *name; @@ -318,6 +365,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, struct posix_acl *dacl = NULL, *pacl = NULL; p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); + v9ses = v9fs_inode2v9ses(dir); omode |= S_IFDIR; if (dir->i_mode & S_ISGID) @@ -352,7 +400,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, } /* instantiate inode and assign the unopened fid to the dentry */ - inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", @@ -749,6 +797,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, kgid_t gid; const unsigned char *name; umode_t mode; + struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; struct p9_qid qid; @@ -758,6 +807,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, dir->i_ino, dentry, omode, MAJOR(rdev), MINOR(rdev)); + v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); @@ -788,7 +838,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, err); goto error; } - inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, 
dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index f52fdf42945c..489db161abc9 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -139,7 +139,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, else sb->s_d_op = &v9fs_dentry_operations; - inode = v9fs_get_inode_from_fid(v9ses, fid, sb, true); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, sb); if (IS_ERR(inode)) { retval = PTR_ERR(inode); goto release_sb; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index f0b999a4961b..017c48a80203 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -6,7 +6,8 @@ */ #include <linux/module.h> #include <linux/init.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> +#include <linux/fs_context.h> #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -115,87 +116,61 @@ static int adfs_show_options(struct seq_file *seq, struct dentry *root) return 0; } -enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err}; +enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix}; -static const match_table_t tokens = { - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_ownmask, "ownmask=%o"}, - {Opt_othmask, "othmask=%o"}, - {Opt_ftsuffix, "ftsuffix=%u"}, - {Opt_err, NULL} +static const struct fs_parameter_spec adfs_param_spec[] = { + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), + fsparam_u32oct ("ownmask", Opt_ownmask), + fsparam_u32oct ("othmask", Opt_othmask), + fsparam_u32 ("ftsuffix", Opt_ftsuffix), + {} }; -static int parse_options(struct super_block *sb, struct adfs_sb_info *asb, - char *options) +static int adfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - int option; - - if (!options) - return 0; - - while ((p = strsep(&options, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; - - token = 
match_token(p, tokens, args); - switch (token) { - case Opt_uid: - if (match_int(args, &option)) - return -EINVAL; - asb->s_uid = make_kuid(current_user_ns(), option); - if (!uid_valid(asb->s_uid)) - return -EINVAL; - break; - case Opt_gid: - if (match_int(args, &option)) - return -EINVAL; - asb->s_gid = make_kgid(current_user_ns(), option); - if (!gid_valid(asb->s_gid)) - return -EINVAL; - break; - case Opt_ownmask: - if (match_octal(args, &option)) - return -EINVAL; - asb->s_owner_mask = option; - break; - case Opt_othmask: - if (match_octal(args, &option)) - return -EINVAL; - asb->s_other_mask = option; - break; - case Opt_ftsuffix: - if (match_int(args, &option)) - return -EINVAL; - asb->s_ftsuffix = option; - break; - default: - adfs_msg(sb, KERN_ERR, - "unrecognised mount option \"%s\" or missing value", - p); - return -EINVAL; - } + struct adfs_sb_info *asb = fc->s_fs_info; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, adfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + asb->s_uid = result.uid; + break; + case Opt_gid: + asb->s_gid = result.gid; + break; + case Opt_ownmask: + asb->s_owner_mask = result.uint_32; + break; + case Opt_othmask: + asb->s_other_mask = result.uint_32; + break; + case Opt_ftsuffix: + asb->s_ftsuffix = result.uint_32; + break; + default: + return -EINVAL; } return 0; } -static int adfs_remount(struct super_block *sb, int *flags, char *data) +static int adfs_reconfigure(struct fs_context *fc) { - struct adfs_sb_info temp_asb; - int ret; + struct adfs_sb_info *new_asb = fc->s_fs_info; + struct adfs_sb_info *asb = ADFS_SB(fc->root->d_sb); - sync_filesystem(sb); - *flags |= ADFS_SB_FLAGS; + sync_filesystem(fc->root->d_sb); + fc->sb_flags |= ADFS_SB_FLAGS; - temp_asb = *ADFS_SB(sb); - ret = parse_options(sb, &temp_asb, data); - if (ret == 0) - *ADFS_SB(sb) = temp_asb; + /* Structure copy newly parsed options */ + *asb = *new_asb; - return ret; + return 0; } static int 
adfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -273,7 +248,6 @@ static const struct super_operations adfs_sops = { .write_inode = adfs_write_inode, .put_super = adfs_put_super, .statfs = adfs_statfs, - .remount_fs = adfs_remount, .show_options = adfs_show_options, }; @@ -361,34 +335,21 @@ static int adfs_validate_dr0(struct super_block *sb, struct buffer_head *bh, return 0; } -static int adfs_fill_super(struct super_block *sb, void *data, int silent) +static int adfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct adfs_discrecord *dr; struct object_info root_obj; - struct adfs_sb_info *asb; + struct adfs_sb_info *asb = sb->s_fs_info; struct inode *root; int ret = -EINVAL; + int silent = fc->sb_flags & SB_SILENT; sb->s_flags |= ADFS_SB_FLAGS; - asb = kzalloc(sizeof(*asb), GFP_KERNEL); - if (!asb) - return -ENOMEM; - sb->s_fs_info = asb; sb->s_magic = ADFS_SUPER_MAGIC; sb->s_time_gran = 10000000; - /* set default options */ - asb->s_uid = GLOBAL_ROOT_UID; - asb->s_gid = GLOBAL_ROOT_GID; - asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; - asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; - asb->s_ftsuffix = 0; - - if (parse_options(sb, asb, data)) - goto error; - /* Try to probe the filesystem boot block */ ret = adfs_probe(sb, ADFS_DISCRECORD, 1, adfs_validate_bblk); if (ret == -EILSEQ) @@ -453,18 +414,61 @@ error: return ret; } -static struct dentry *adfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int adfs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, adfs_fill_super); +} + +static void adfs_free_fc(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super); + struct adfs_context *asb = fc->s_fs_info; + + kfree(asb); +} + +static const struct fs_context_operations adfs_context_ops = { + .parse_param = adfs_parse_param, + .get_tree = adfs_get_tree, + .reconfigure = adfs_reconfigure, + .free = adfs_free_fc, +}; + +static int 
adfs_init_fs_context(struct fs_context *fc) +{ + struct adfs_sb_info *asb; + + asb = kzalloc(sizeof(struct adfs_sb_info), GFP_KERNEL); + if (!asb) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct super_block *sb = fc->root->d_sb; + struct adfs_sb_info *old_asb = ADFS_SB(sb); + + /* structure copy existing options before parsing */ + *asb = *old_asb; + } else { + /* set default options */ + asb->s_uid = GLOBAL_ROOT_UID; + asb->s_gid = GLOBAL_ROOT_GID; + asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; + asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; + asb->s_ftsuffix = 0; + } + + fc->ops = &adfs_context_ops; + fc->s_fs_info = asb; + + return 0; } static struct file_system_type adfs_fs_type = { .owner = THIS_MODULE, .name = "adfs", - .mount = adfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = adfs_init_fs_context, + .parameters = adfs_param_spec, }; MODULE_ALIAS_FS("adfs"); diff --git a/fs/affs/super.c b/fs/affs/super.c index 3c5821339609..2fa40337776d 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -14,7 +14,8 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/statfs.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> +#include <linux/fs_context.h> #include <linux/magic.h> #include <linux/sched.h> #include <linux/cred.h> @@ -27,7 +28,6 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); static int affs_show_options(struct seq_file *m, struct dentry *root); -static int affs_remount (struct super_block *sb, int *flags, char *data); static void affs_commit_super(struct super_block *sb, int wait) @@ -155,140 +155,114 @@ static const struct super_operations affs_sops = { .put_super = affs_put_super, .sync_fs = affs_sync_fs, .statfs = affs_statfs, - .remount_fs = affs_remount, .show_options = affs_show_options, }; enum { Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect, Opt_reserved, Opt_root, Opt_setgid, Opt_setuid, - Opt_verbose, 
Opt_volume, Opt_ignore, Opt_err, + Opt_verbose, Opt_volume, Opt_ignore, }; -static const match_table_t tokens = { - {Opt_bs, "bs=%u"}, - {Opt_mode, "mode=%o"}, - {Opt_mufs, "mufs"}, - {Opt_notruncate, "nofilenametruncate"}, - {Opt_prefix, "prefix=%s"}, - {Opt_protect, "protect"}, - {Opt_reserved, "reserved=%u"}, - {Opt_root, "root=%u"}, - {Opt_setgid, "setgid=%u"}, - {Opt_setuid, "setuid=%u"}, - {Opt_verbose, "verbose"}, - {Opt_volume, "volume=%s"}, - {Opt_ignore, "grpquota"}, - {Opt_ignore, "noquota"}, - {Opt_ignore, "quota"}, - {Opt_ignore, "usrquota"}, - {Opt_err, NULL}, +struct affs_context { + kuid_t uid; /* uid to override */ + kgid_t gid; /* gid to override */ + unsigned int mode; /* mode to override */ + unsigned int reserved; /* Number of reserved blocks */ + int root_block; /* FFS root block number */ + int blocksize; /* Initial device blksize */ + char *prefix; /* Prefix for volumes and assigns */ + char volume[32]; /* Vol. prefix for absolute symlinks */ + unsigned long mount_flags; /* Options */ }; -static int -parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, s32 *root, - int *blocksize, char **prefix, char *volume, unsigned long *mount_opts) +static const struct fs_parameter_spec affs_param_spec[] = { + fsparam_u32 ("bs", Opt_bs), + fsparam_u32oct ("mode", Opt_mode), + fsparam_flag ("mufs", Opt_mufs), + fsparam_flag ("nofilenametruncate", Opt_notruncate), + fsparam_string ("prefix", Opt_prefix), + fsparam_flag ("protect", Opt_protect), + fsparam_u32 ("reserved", Opt_reserved), + fsparam_u32 ("root", Opt_root), + fsparam_gid ("setgid", Opt_setgid), + fsparam_uid ("setuid", Opt_setuid), + fsparam_flag ("verbose", Opt_verbose), + fsparam_string ("volume", Opt_volume), + fsparam_flag ("grpquota", Opt_ignore), + fsparam_flag ("noquota", Opt_ignore), + fsparam_flag ("quota", Opt_ignore), + fsparam_flag ("usrquota", Opt_ignore), + {}, +}; + +static int affs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - 
char *p; - substring_t args[MAX_OPT_ARGS]; - - /* Fill in defaults */ - - *uid = current_uid(); - *gid = current_gid(); - *reserved = 2; - *root = -1; - *blocksize = -1; - volume[0] = ':'; - volume[1] = 0; - *mount_opts = 0; - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token, n, option; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_bs: - if (match_int(&args[0], &n)) - return 0; - if (n != 512 && n != 1024 && n != 2048 - && n != 4096) { - pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); - return 0; - } - *blocksize = n; - break; - case Opt_mode: - if (match_octal(&args[0], &option)) - return 0; - *mode = option & 0777; - affs_set_opt(*mount_opts, SF_SETMODE); - break; - case Opt_mufs: - affs_set_opt(*mount_opts, SF_MUFS); - break; - case Opt_notruncate: - affs_set_opt(*mount_opts, SF_NO_TRUNCATE); - break; - case Opt_prefix: - kfree(*prefix); - *prefix = match_strdup(&args[0]); - if (!*prefix) - return 0; - affs_set_opt(*mount_opts, SF_PREFIX); - break; - case Opt_protect: - affs_set_opt(*mount_opts, SF_IMMUTABLE); - break; - case Opt_reserved: - if (match_int(&args[0], reserved)) - return 0; - break; - case Opt_root: - if (match_int(&args[0], root)) - return 0; - break; - case Opt_setgid: - if (match_int(&args[0], &option)) - return 0; - *gid = make_kgid(current_user_ns(), option); - if (!gid_valid(*gid)) - return 0; - affs_set_opt(*mount_opts, SF_SETGID); - break; - case Opt_setuid: - if (match_int(&args[0], &option)) - return 0; - *uid = make_kuid(current_user_ns(), option); - if (!uid_valid(*uid)) - return 0; - affs_set_opt(*mount_opts, SF_SETUID); - break; - case Opt_verbose: - affs_set_opt(*mount_opts, SF_VERBOSE); - break; - case Opt_volume: { - char *vol = match_strdup(&args[0]); - if (!vol) - return 0; - strscpy(volume, vol, 32); - kfree(vol); - break; - } - case Opt_ignore: - /* Silently ignore the quota options */ - break; - default: - 
pr_warn("Unrecognized mount option \"%s\" or missing value\n", - p); - return 0; + struct affs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int n; + int opt; + + opt = fs_parse(fc, affs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_bs: + n = result.uint_32; + if (n != 512 && n != 1024 && n != 2048 + && n != 4096) { + pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); + return -EINVAL; } + ctx->blocksize = n; + break; + case Opt_mode: + ctx->mode = result.uint_32 & 0777; + affs_set_opt(ctx->mount_flags, SF_SETMODE); + break; + case Opt_mufs: + affs_set_opt(ctx->mount_flags, SF_MUFS); + break; + case Opt_notruncate: + affs_set_opt(ctx->mount_flags, SF_NO_TRUNCATE); + break; + case Opt_prefix: + kfree(ctx->prefix); + ctx->prefix = param->string; + param->string = NULL; + affs_set_opt(ctx->mount_flags, SF_PREFIX); + break; + case Opt_protect: + affs_set_opt(ctx->mount_flags, SF_IMMUTABLE); + break; + case Opt_reserved: + ctx->reserved = result.uint_32; + break; + case Opt_root: + ctx->root_block = result.uint_32; + break; + case Opt_setgid: + ctx->gid = result.gid; + affs_set_opt(ctx->mount_flags, SF_SETGID); + break; + case Opt_setuid: + ctx->uid = result.uid; + affs_set_opt(ctx->mount_flags, SF_SETUID); + break; + case Opt_verbose: + affs_set_opt(ctx->mount_flags, SF_VERBOSE); + break; + case Opt_volume: + strscpy(ctx->volume, param->string, 32); + break; + case Opt_ignore: + /* Silently ignore the quota options */ + break; + default: + return -EINVAL; } - return 1; + return 0; } static int affs_show_options(struct seq_file *m, struct dentry *root) @@ -329,27 +303,22 @@ static int affs_show_options(struct seq_file *m, struct dentry *root) * hopefully have the guts to do so. Until then: sorry for the mess. 
*/ -static int affs_fill_super(struct super_block *sb, void *data, int silent) +static int affs_fill_super(struct super_block *sb, struct fs_context *fc) { struct affs_sb_info *sbi; + struct affs_context *ctx = fc->fs_private; struct buffer_head *root_bh = NULL; struct buffer_head *boot_bh; struct inode *root_inode = NULL; - s32 root_block; + int silent = fc->sb_flags & SB_SILENT; int size, blocksize; u32 chksum; int num_bm; int i, j; - kuid_t uid; - kgid_t gid; - int reserved; - unsigned long mount_flags; int tmp_flags; /* fix remount prototype... */ u8 sig[4]; int ret; - pr_debug("read_super(%s)\n", data ? (const char *)data : "no options"); - sb->s_magic = AFFS_SUPER_MAGIC; sb->s_op = &affs_sops; sb->s_flags |= SB_NODIRATIME; @@ -369,19 +338,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sb_work, flush_superblock); - if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, - &blocksize,&sbi->s_prefix, - sbi->s_volume, &mount_flags)) { - pr_err("Error parsing options\n"); - return -EINVAL; - } - /* N.B. after this point s_prefix must be released */ + sbi->s_flags = ctx->mount_flags; + sbi->s_mode = ctx->mode; + sbi->s_uid = ctx->uid; + sbi->s_gid = ctx->gid; + sbi->s_reserved = ctx->reserved; + sbi->s_prefix = ctx->prefix; + ctx->prefix = NULL; + memcpy(sbi->s_volume, ctx->volume, 32); - sbi->s_flags = mount_flags; - sbi->s_mode = i; - sbi->s_uid = uid; - sbi->s_gid = gid; - sbi->s_reserved= reserved; + /* N.B. after this point s_prefix must be released */ /* Get the size of the device in 512-byte blocks. 
* If we later see that the partition uses bigger @@ -396,15 +362,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) i = bdev_logical_block_size(sb->s_bdev); j = PAGE_SIZE; + blocksize = ctx->blocksize; if (blocksize > 0) { i = j = blocksize; size = size / (blocksize / 512); } for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) { - sbi->s_root_block = root_block; - if (root_block < 0) - sbi->s_root_block = (reserved + size - 1) / 2; + sbi->s_root_block = ctx->root_block; + if (ctx->root_block < 0) + sbi->s_root_block = (ctx->reserved + size - 1) / 2; pr_debug("setting blocksize to %d\n", blocksize); affs_set_blocksize(sb, blocksize); sbi->s_partition_size = size; @@ -424,7 +391,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) "size=%d, reserved=%d\n", sb->s_id, sbi->s_root_block + num_bm, - blocksize, size, reserved); + ctx->blocksize, size, ctx->reserved); root_bh = affs_bread(sb, sbi->s_root_block + num_bm); if (!root_bh) continue; @@ -447,7 +414,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) got_root: /* Keep super block in cache */ sbi->s_root_bh = root_bh; - root_block = sbi->s_root_block; + ctx->root_block = sbi->s_root_block; /* Find out which kind of FS we have */ boot_bh = sb_bread(sb, 0); @@ -506,7 +473,7 @@ got_root: return -EINVAL; } - if (affs_test_opt(mount_flags, SF_VERBOSE)) { + if (affs_test_opt(ctx->mount_flags, SF_VERBOSE)) { u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0]; pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n", len > 31 ? 
31 : len, @@ -528,7 +495,7 @@ got_root: /* set up enough so that it can read an inode */ - root_inode = affs_iget(sb, root_block); + root_inode = affs_iget(sb, ctx->root_block); if (IS_ERR(root_inode)) return PTR_ERR(root_inode); @@ -548,56 +515,43 @@ got_root: return 0; } -static int -affs_remount(struct super_block *sb, int *flags, char *data) +static int affs_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; + struct affs_context *ctx = fc->fs_private; struct affs_sb_info *sbi = AFFS_SB(sb); - int blocksize; - kuid_t uid; - kgid_t gid; - int mode; - int reserved; - int root_block; - unsigned long mount_flags; int res = 0; - char volume[32]; - char *prefix = NULL; - - pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data); sync_filesystem(sb); - *flags |= SB_NODIRATIME; - - memcpy(volume, sbi->s_volume, 32); - if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, - &blocksize, &prefix, volume, - &mount_flags)) { - kfree(prefix); - return -EINVAL; - } + fc->sb_flags |= SB_NODIRATIME; flush_delayed_work(&sbi->sb_work); - sbi->s_flags = mount_flags; - sbi->s_mode = mode; - sbi->s_uid = uid; - sbi->s_gid = gid; + /* + * NB: Historically, only mount_flags, mode, uid, gid, prefix, + * and volume are accepted during remount.
+ */ + sbi->s_flags = ctx->mount_flags; + sbi->s_mode = ctx->mode; + sbi->s_uid = ctx->uid; + sbi->s_gid = ctx->gid; /* protect against readers */ spin_lock(&sbi->symlink_lock); - if (prefix) { + if (ctx->prefix) { kfree(sbi->s_prefix); - sbi->s_prefix = prefix; + sbi->s_prefix = ctx->prefix; + ctx->prefix = NULL; } - memcpy(sbi->s_volume, volume, 32); + memcpy(sbi->s_volume, ctx->volume, 32); spin_unlock(&sbi->symlink_lock); - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) + if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (*flags & SB_RDONLY) + if (fc->sb_flags & SB_RDONLY) affs_free_bitmap(sb); else - res = affs_init_bitmap(sb, flags); + res = affs_init_bitmap(sb, &fc->sb_flags); return res; } @@ -624,10 +578,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static struct dentry *affs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int affs_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super); + return get_tree_bdev(fc, affs_fill_super); } static void affs_kill_sb(struct super_block *sb) @@ -643,12 +596,61 @@ static void affs_kill_sb(struct super_block *sb) } } +static void affs_free_fc(struct fs_context *fc) +{ + struct affs_context *ctx = fc->fs_private; + + kfree(ctx->prefix); + kfree(ctx); +} + +static const struct fs_context_operations affs_context_ops = { + .parse_param = affs_parse_param, + .get_tree = affs_get_tree, + .reconfigure = affs_reconfigure, + .free = affs_free_fc, +}; + +static int affs_init_fs_context(struct fs_context *fc) +{ + struct affs_context *ctx; + + ctx = kzalloc(sizeof(struct affs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct super_block *sb = fc->root->d_sb; + struct affs_sb_info *sbi = AFFS_SB(sb); + + /* + * NB: historically, no options other than volume were + * preserved across a remount unless they were explicitly + * 
passed in. + */ + memcpy(ctx->volume, sbi->s_volume, 32); + } else { + ctx->uid = current_uid(); + ctx->gid = current_gid(); + ctx->reserved = 2; + ctx->root_block = -1; + ctx->blocksize = -1; + ctx->volume[0] = ':'; + } + + fc->ops = &affs_context_ops; + fc->fs_private = ctx; + + return 0; +} + static struct file_system_type affs_fs_type = { .owner = THIS_MODULE, .name = "affs", - .mount = affs_mount, .kill_sb = affs_kill_sb, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = affs_init_fs_context, + .parameters = affs_param_spec, }; MODULE_ALIAS_FS("affs"); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index f8622ed72e08..ada363af5aab 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -12,6 +12,7 @@ #include <linux/swap.h> #include <linux/ctype.h> #include <linux/sched.h> +#include <linux/iversion.h> #include <linux/task_io_accounting_ops.h> #include "internal.h" #include "afs_fs.h" @@ -1823,6 +1824,8 @@ error: static void afs_rename_success(struct afs_operation *op) { + struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry)); + _enter("op=%08x", op->debug_id); op->ctime = op->file[0].scb.status.mtime_client; @@ -1832,6 +1835,22 @@ static void afs_rename_success(struct afs_operation *op) op->ctime = op->file[1].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[1]); } + + /* If we're moving a subdir between dirs, we need to update + * its DV counter too as the ".." will be altered. 
+ */ + if (S_ISDIR(vnode->netfs.inode.i_mode) && + op->file[0].vnode != op->file[1].vnode) { + u64 new_dv; + + write_seqlock(&vnode->cb_lock); + + new_dv = vnode->status.data_version + 1; + vnode->status.data_version = new_dv; + inode_set_iversion_raw(&vnode->netfs.inode, new_dv); + + write_sequnlock(&vnode->cb_lock); + } } static void afs_rename_edit_dir(struct afs_operation *op) @@ -1873,6 +1892,12 @@ static void afs_rename_edit_dir(struct afs_operation *op) &vnode->fid, afs_edit_dir_for_rename_2); } + if (S_ISDIR(vnode->netfs.inode.i_mode) && + new_dvnode != orig_dvnode && + test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + afs_edit_dir_update_dotdot(vnode, new_dvnode, + afs_edit_dir_for_rename_sub); + new_inode = d_inode(new_dentry); if (new_inode) { spin_lock(&new_inode->i_lock); diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index a71bff10496b..fe223fb78111 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -127,10 +127,10 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) /* * Scan a directory block looking for a dirent of the right name. */ -static int afs_dir_scan_block(union afs_xdr_dir_block *block, struct qstr *name, +static int afs_dir_scan_block(const union afs_xdr_dir_block *block, const struct qstr *name, unsigned int blocknum) { - union afs_xdr_dirent *de; + const union afs_xdr_dirent *de; u64 bitmap; int d, len, n; @@ -492,3 +492,90 @@ error: clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); goto out_unmap; } + +/* + * Edit a subdirectory that has been moved between directories to update the + * ".." entry. 
+ */ +void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, + enum afs_edit_dir_reason why) +{ + union afs_xdr_dir_block *block; + union afs_xdr_dirent *de; + struct folio *folio; + unsigned int nr_blocks, b; + pgoff_t index; + loff_t i_size; + int slot; + + _enter(""); + + i_size = i_size_read(&vnode->netfs.inode); + if (i_size < AFS_DIR_BLOCK_SIZE) { + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + return; + } + nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; + + /* Find a block that has sufficient slots available. Each folio + * contains two or more directory blocks. + */ + for (b = 0; b < nr_blocks; b++) { + index = b / AFS_DIR_BLOCKS_PER_PAGE; + folio = afs_dir_get_folio(vnode, index); + if (!folio) + goto error; + + block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio)); + + /* Abandon the edit if we got a callback break. */ + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto invalidated; + + slot = afs_dir_scan_block(block, &dotdot_name, b); + if (slot >= 0) + goto found_dirent; + + kunmap_local(block); + folio_unlock(folio); + folio_put(folio); + } + + /* Didn't find the dirent to clobber. Download the directory again. 
*/ + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd, + 0, 0, 0, 0, ".."); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out; + +found_dirent: + de = &block->dirents[slot]; + de->u.vnode = htonl(new_dvnode->fid.vnode); + de->u.unique = htonl(new_dvnode->fid.unique); + + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot, + ntohl(de->u.vnode), ntohl(de->u.unique), ".."); + + kunmap_local(block); + folio_unlock(folio); + folio_put(folio); + inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); + +out: + _leave(""); + return; + +invalidated: + kunmap_local(block); + folio_unlock(folio); + folio_put(folio); + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval, + 0, 0, 0, 0, ".."); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out; + +error: + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error, + 0, 0, 0, 0, ".."); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out; +} diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 6e1d3c4daf72..c9d620175e80 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -130,6 +130,7 @@ struct afs_call { wait_queue_head_t waitq; /* processes awaiting completion */ struct work_struct async_work; /* async I/O processor */ struct work_struct work; /* actual work processor */ + struct work_struct free_work; /* Deferred free processor */ struct rxrpc_call *rxcall; /* RxRPC call handle */ struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* security for this call */ @@ -1072,6 +1073,8 @@ extern void afs_check_for_remote_deletion(struct afs_operation *); extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, enum afs_edit_dir_reason); extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); +void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, + enum afs_edit_dir_reason why); /* * dir_silly.c @@ -1331,6 +1334,7 @@ extern int __net_init 
afs_open_socket(struct afs_net *); extern void __net_exit afs_close_socket(struct afs_net *); extern void afs_charge_preallocation(struct work_struct *); extern void afs_put_call(struct afs_call *); +void afs_deferred_put_call(struct afs_call *call); void afs_make_call(struct afs_call *call, gfp_t gfp); void afs_wait_for_call_to_complete(struct afs_call *call); extern struct afs_call *afs_alloc_flat_call(struct afs_net *, diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index c453428f3c8b..9f2a3bb56ec6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -18,6 +18,7 @@ struct workqueue_struct *afs_async_calls; +static void afs_deferred_free_worker(struct work_struct *work); static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_process_async_call(struct work_struct *); @@ -149,6 +150,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net, call->debug_id = atomic_inc_return(&rxrpc_debug_id); refcount_set(&call->ref, 1); INIT_WORK(&call->async_work, afs_process_async_call); + INIT_WORK(&call->free_work, afs_deferred_free_worker); init_waitqueue_head(&call->waitq); spin_lock_init(&call->state_lock); call->iter = &call->def_iter; @@ -159,6 +161,36 @@ static struct afs_call *afs_alloc_call(struct afs_net *net, return call; } +static void afs_free_call(struct afs_call *call) +{ + struct afs_net *net = call->net; + int o; + + ASSERT(!work_pending(&call->async_work)); + + rxrpc_kernel_put_peer(call->peer); + + if (call->rxcall) { + rxrpc_kernel_shutdown_call(net->socket, call->rxcall); + rxrpc_kernel_put_call(net->socket, call->rxcall); + call->rxcall = NULL; + } + if (call->type->destructor) + call->type->destructor(call); + + afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call); + kfree(call->request); + + o = atomic_read(&net->nr_outstanding_calls); + trace_afs_call(call->debug_id, afs_call_trace_free, 0, o, + 
__builtin_return_address(0)); + kfree(call); + + o = atomic_dec_return(&net->nr_outstanding_calls); + if (o == 0) + wake_up_var(&net->nr_outstanding_calls); +} + /* * Dispose of a reference on a call. */ @@ -173,32 +205,34 @@ void afs_put_call(struct afs_call *call) o = atomic_read(&net->nr_outstanding_calls); trace_afs_call(debug_id, afs_call_trace_put, r - 1, o, __builtin_return_address(0)); + if (zero) + afs_free_call(call); +} - if (zero) { - ASSERT(!work_pending(&call->async_work)); - ASSERT(call->type->name != NULL); - - rxrpc_kernel_put_peer(call->peer); - - if (call->rxcall) { - rxrpc_kernel_shutdown_call(net->socket, call->rxcall); - rxrpc_kernel_put_call(net->socket, call->rxcall); - call->rxcall = NULL; - } - if (call->type->destructor) - call->type->destructor(call); +static void afs_deferred_free_worker(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, free_work); - afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call); - kfree(call->request); + afs_free_call(call); +} - trace_afs_call(call->debug_id, afs_call_trace_free, 0, o, - __builtin_return_address(0)); - kfree(call); +/* + * Dispose of a reference on a call, deferring the cleanup to a workqueue + * to avoid lock recursion. 
+ */ +void afs_deferred_put_call(struct afs_call *call) +{ + struct afs_net *net = call->net; + unsigned int debug_id = call->debug_id; + bool zero; + int r, o; - o = atomic_dec_return(&net->nr_outstanding_calls); - if (o == 0) - wake_up_var(&net->nr_outstanding_calls); - } + zero = __refcount_dec_and_test(&call->ref, &r); + o = atomic_read(&net->nr_outstanding_calls); + trace_afs_call(debug_id, afs_call_trace_put, r - 1, o, + __builtin_return_address(0)); + if (zero) + schedule_work(&call->free_work); } static struct afs_call *afs_get_call(struct afs_call *call, @@ -640,7 +674,8 @@ static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall, } /* - * wake up an asynchronous call + * Wake up an asynchronous call. The caller is holding the call notify + * spinlock around this, so we can't call afs_put_call(). */ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long call_user_ID) @@ -657,7 +692,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, __builtin_return_address(0)); if (!queue_work(afs_async_calls, &call->async_work)) - afs_put_call(call); + afs_deferred_put_call(call); } } @@ -2191,7 +2191,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { if (kiocb->ki_res.obj == obj) { ret = kiocb->ki_cancel(&kiocb->rw); diff --git a/fs/attr.c b/fs/attr.c index c04d19b58f12..9caf63d20d03 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -272,6 +272,47 @@ out_big: EXPORT_SYMBOL(inode_newsize_ok); /** + * setattr_copy_mgtime - update timestamps for mgtime inodes + * @inode: inode timestamps to be updated + * @attr: attrs for the update + * + * With multigrain timestamps, take more care to prevent races when + * updating the ctime. 
Always update the ctime to the very latest using + * the standard mechanism, and use that to populate the atime and mtime + * appropriately (unless those are being set to specific values). + */ +static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + struct timespec64 now; + + if (ia_valid & ATTR_CTIME) { + /* + * In the case of an update for a write delegation, we must respect + * the value in ia_ctime and not use the current time. + */ + if (ia_valid & ATTR_DELEG) + now = inode_set_ctime_deleg(inode, attr->ia_ctime); + else + now = inode_set_ctime_current(inode); + } else { + /* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. */ + WARN_ON_ONCE(ia_valid & ATTR_MTIME); + now = current_time(inode); + } + + if (ia_valid & ATTR_ATIME_SET) + inode_set_atime_to_ts(inode, attr->ia_atime); + else if (ia_valid & ATTR_ATIME) + inode_set_atime_to_ts(inode, now); + + if (ia_valid & ATTR_MTIME_SET) + inode_set_mtime_to_ts(inode, attr->ia_mtime); + else if (ia_valid & ATTR_MTIME) + inode_set_mtime_to_ts(inode, now); +} + +/** * setattr_copy - copy simple metadata updates into the generic inode * @idmap: idmap of the mount the inode was found from * @inode: the inode to be updated @@ -303,12 +344,6 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); - if (ia_valid & ATTR_ATIME) - inode_set_atime_to_ts(inode, attr->ia_atime); - if (ia_valid & ATTR_MTIME) - inode_set_mtime_to_ts(inode, attr->ia_mtime); - if (ia_valid & ATTR_CTIME) - inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; if (!in_group_or_capable(idmap, inode, @@ -316,6 +351,20 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, mode &= ~S_ISGID; inode->i_mode = mode; } + + if (is_mgtime(inode)) + return setattr_copy_mgtime(inode, attr); + + if (ia_valid & ATTR_ATIME) + 
inode_set_atime_to_ts(inode, attr->ia_atime); + if (ia_valid & ATTR_MTIME) + inode_set_mtime_to_ts(inode, attr->ia_mtime); + if (ia_valid & ATTR_CTIME) { + if (ia_valid & ATTR_DELEG) + inode_set_ctime_deleg(inode, attr->ia_ctime); + else + inode_set_ctime_to_ts(inode, attr->ia_ctime); + } } EXPORT_SYMBOL(setattr_copy); diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index f011e026358e..6d57efbb8110 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -110,6 +110,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) */ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) { + unsigned int inr = _IOC_NR(cmd); int err; err = check_dev_ioctl_version(cmd, param); @@ -133,7 +134,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) * check_name() return for AUTOFS_DEV_IOCTL_TIMEOUT_CMD. */ err = check_name(param->path); - if (cmd == AUTOFS_DEV_IOCTL_TIMEOUT_CMD) + if (inr == AUTOFS_DEV_IOCTL_TIMEOUT_CMD) err = err ? 
0 : -EINVAL; if (err) { pr_warn("invalid path supplied for cmd(0x%08x)\n", @@ -141,8 +142,6 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) goto out; } } else { - unsigned int inr = _IOC_NR(cmd); - if (inr == AUTOFS_DEV_IOCTL_OPENMOUNT_CMD || inr == AUTOFS_DEV_IOCTL_REQUESTER_CMD || inr == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) { diff --git a/fs/backing-file.c b/fs/backing-file.c index 8860dac58c37..09a9be945d45 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -80,7 +80,7 @@ struct backing_aio { refcount_t ref; struct kiocb *orig_iocb; /* used for aio completion */ - void (*end_write)(struct file *); + void (*end_write)(struct file *, loff_t, ssize_t); struct work_struct work; long res; }; @@ -109,7 +109,7 @@ static void backing_aio_cleanup(struct backing_aio *aio, long res) struct kiocb *orig_iocb = aio->orig_iocb; if (aio->end_write) - aio->end_write(orig_iocb->ki_filp); + aio->end_write(orig_iocb->ki_filp, iocb->ki_pos, res); orig_iocb->ki_pos = iocb->ki_pos; backing_aio_put(aio); @@ -239,7 +239,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); if (ctx->end_write) - ctx->end_write(ctx->user_file); + ctx->end_write(ctx->user_file, iocb->ki_pos, ret); } else { struct backing_aio *aio; @@ -317,7 +317,7 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, revert_creds(old_cred); if (ctx->end_write) - ctx->end_write(ctx->user_file); + ctx->end_write(ctx->user_file, ppos ? 
*ppos : 0, ret); return ret; } diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 4e4a448f6931..c84a91572a1d 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -639,6 +639,16 @@ int bch2_alloc_read(struct bch_fs *c) continue; } + if (k.k->p.offset < ca->mi.first_bucket) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket)); + continue; + } + + if (k.k->p.offset >= ca->mi.nbuckets) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + struct bch_alloc_v4 a; *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; 0; @@ -1967,7 +1977,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work) ca->mi.bucket_size, GFP_KERNEL); - int ret = bch2_trans_do(c, NULL, NULL, + int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_WATERMARK_btree| BCH_TRANS_COMMIT_no_enospc, bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); @@ -2127,14 +2137,15 @@ static void bch2_do_invalidates_work(struct work_struct *work) struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; if (ret) - break; + goto restart_err; if (!k.k) break; ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); +restart_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; if (ret) break; @@ -2340,24 +2351,19 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) /* Bucket IO clocks: */ -int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, - size_t bucket_nr, int rw) +static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i_alloc_v4 *a; - u64 now; - int ret = 0; - - if (bch2_trans_relock(trans)) - bch2_trans_begin(trans); - a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr)); - 
ret = PTR_ERR_OR_ZERO(a); + struct btree_iter iter; + struct bkey_i_alloc_v4 *a = + bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr)); + int ret = PTR_ERR_OR_ZERO(a); if (ret) return ret; - now = bch2_current_io_time(c, rw); + u64 now = bch2_current_io_time(c, rw); if (a->v.io_time[rw] == now) goto out; @@ -2370,6 +2376,15 @@ out: return ret; } +int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) +{ + if (bch2_trans_relock(trans)) + bch2_trans_begin(trans); + + return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw)); +} + /* Startup/shutdown (ro/rw): */ void bch2_recalc_capacity(struct bch_fs *c) diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index f8e87c6721b1..163a67b97a40 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -168,6 +168,9 @@ static inline bool data_type_movable(enum bch_data_type type) static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, struct bch_dev *ca) { + if (a.data_type >= BCH_DATA_NR) + return 0; + if (!data_type_movable(a.data_type) || !bch2_bucket_sectors_fragmented(ca, a)) return 0; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index d0e0b56892e3..372178c8d416 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -162,6 +162,10 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) ARRAY_SIZE(c->open_buckets_partial)); spin_lock(&c->freelist_lock); + rcu_read_lock(); + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; + rcu_read_unlock(); + ob->on_partial_list = true; c->open_buckets_partial[c->open_buckets_partial_nr++] = ob - c->open_buckets; @@ -684,7 +688,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, struct bch_dev_usage usage; struct open_bucket *ob; - bch2_trans_do(c, NULL, NULL, 0, + bch2_trans_do(c, PTR_ERR_OR_ZERO(ob = 
bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, false, &usage))); return ob; @@ -972,7 +976,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, u64 avail; bch2_dev_usage_read_fast(ca, &usage); - avail = dev_buckets_free(ca, usage, watermark); + avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets; if (!avail) continue; @@ -981,6 +985,10 @@ static int bucket_alloc_set_partial(struct bch_fs *c, i); ob->on_partial_list = false; + rcu_read_lock(); + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; + rcu_read_unlock(); + ret = add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, have_cache, ob); @@ -1191,7 +1199,13 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, --c->open_buckets_partial_nr; swap(c->open_buckets_partial[i], c->open_buckets_partial[c->open_buckets_partial_nr]); + ob->on_partial_list = false; + + rcu_read_lock(); + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; + rcu_read_unlock(); + spin_unlock(&c->freelist_lock); bch2_open_bucket_put(c, ob); spin_lock(&c->freelist_lock); @@ -1610,8 +1624,7 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c, ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list && - (!ca || ob->dev == ca->dev_idx)) + if (ob->valid && (!ca || ob->dev == ca->dev_idx)) bch2_open_bucket_to_text(out, c, ob); spin_unlock(&ob->lock); } diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 47455a85c909..654a58132a4d 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -52,6 +52,12 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + int ret = 0; + + bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH, + c, backpointer_level_bad, + "backpointer level bad: %u >= %u", + bp.v->level, BTREE_MAX_DEPTH); rcu_read_lock(); struct bch_dev *ca = 
bch2_dev_rcu_noerror(c, bp.k->p.inode); @@ -64,7 +70,6 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p); struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset); rcu_read_unlock(); - int ret = 0; bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || !bpos_eq(bp.k->p, bp_pos), @@ -947,9 +952,13 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) static int check_one_backpointer(struct btree_trans *trans, struct bbpos start, struct bbpos end, - struct bkey_s_c_backpointer bp, + struct bkey_s_c bp_k, struct bkey_buf *last_flushed) { + if (bp_k.k->type != KEY_TYPE_backpointer) + return 0; + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); struct bch_fs *c = trans->c; struct btree_iter iter; struct bbpos pos = bp_to_bbpos(*bp.v); @@ -1004,9 +1013,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); - check_one_backpointer(trans, start, end, - bkey_s_c_to_backpointer(k), - &last_flushed); + check_one_backpointer(trans, start, end, k, &last_flushed); })); bch2_bkey_buf_exit(&last_flushed, c); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index f4151ee51b03..e94a83b8113e 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -555,6 +555,7 @@ struct bch_dev { u64 alloc_cursor[3]; unsigned nr_open_buckets; + unsigned nr_partial_buckets; unsigned nr_btree_reserve; size_t inc_gen_needs_gc; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 84832c2d4df9..5004f6ba997c 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -678,7 +678,8 @@ struct bch_sb_field_ext { x(disk_accounting_v2, BCH_VERSION(1, 9)) \ x(disk_accounting_v3, BCH_VERSION(1, 10)) \ 
x(disk_accounting_inum, BCH_VERSION(1, 11)) \ - x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) + x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ + x(inode_has_child_snapshots, BCH_VERSION(1, 13)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 587d7318a2e8..995ba32e9b6e 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -643,7 +643,7 @@ int bch2_bkey_format_invalid(struct bch_fs *c, enum bch_validate_flags flags, struct printbuf *err) { - unsigned i, bits = KEY_PACKED_BITS_START; + unsigned bits = KEY_PACKED_BITS_START; if (f->nr_fields != BKEY_NR_FIELDS) { prt_printf(err, "incorrect number of fields: got %u, should be %u", @@ -655,9 +655,8 @@ int bch2_bkey_format_invalid(struct bch_fs *c, * Verify that the packed format can't represent fields larger than the * unpacked format: */ - for (i = 0; i < f->nr_fields; i++) { - if ((!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) && - bch2_bkey_format_field_overflows(f, i)) { + for (unsigned i = 0; i < f->nr_fields; i++) { + if (bch2_bkey_format_field_overflows(f, i)) { unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); unsigned packed_bits = min(64, f->bits_per_field[i]); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 6e4afb2b5441..7123019ab3bc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -59,16 +59,38 @@ static inline size_t btree_cache_can_free(struct btree_cache_list *list) static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) { + BUG_ON(!list_empty(&b->list)); + if (b->c.lock.readers) - list_move(&b->list, &bc->freed_pcpu); + list_add(&b->list, &bc->freed_pcpu); else - list_move(&b->list, &bc->freed_nonpcpu); + list_add(&b->list, &bc->freed_nonpcpu); +} + +static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b) +{ + 
BUG_ON(!list_empty(&b->list)); + BUG_ON(!b->data); + + bc->nr_freeable++; + list_add(&b->list, &bc->freeable); } -static void btree_node_data_free(struct bch_fs *c, struct btree *b) +void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) { struct btree_cache *bc = &c->btree_cache; + mutex_lock(&bc->lock); + __bch2_btree_node_to_freelist(bc, b); + mutex_unlock(&bc->lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +} + +static void __btree_node_data_free(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(!list_empty(&b->list)); BUG_ON(btree_node_hashed(b)); /* @@ -94,11 +116,17 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) #endif b->aux_data = NULL; - bc->nr_freeable--; - btree_node_to_freedlist(bc, b); } +static void btree_node_data_free(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(list_empty(&b->list)); + list_del_init(&b->list); + --bc->nr_freeable; + __btree_node_data_free(bc, b); +} + static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -174,21 +202,10 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) bch2_btree_lock_init(&b->c, 0); - bc->nr_freeable++; - list_add(&b->list, &bc->freeable); + __bch2_btree_node_to_freelist(bc, b); return b; } -void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) -{ - mutex_lock(&c->btree_cache.lock); - list_move(&b->list, &c->btree_cache.freeable); - mutex_unlock(&c->btree_cache.lock); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); -} - static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b) { struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); @@ -236,11 +253,11 @@ void bch2_btree_cache_unpin(struct bch_fs *c) /* Btree in memory cache - hash table */ -void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) { lockdep_assert_held(&bc->lock); - int ret = 
rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); BUG_ON(ret); /* Cause future lookups for this node to fail: */ @@ -248,17 +265,22 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) if (b->c.btree_id < BTREE_ID_NR) --bc->nr_by_btree[b->c.btree_id]; + --bc->live[btree_node_pinned(b)].nr; + list_del_init(&b->list); +} - bc->live[btree_node_pinned(b)].nr--; - bc->nr_freeable++; - list_move(&b->list, &bc->freeable); +void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +{ + __bch2_btree_node_hash_remove(bc, b); + __bch2_btree_node_to_freelist(bc, b); } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) { + BUG_ON(!list_empty(&b->list)); BUG_ON(b->hash_val); - b->hash_val = btree_ptr_hash_val(&b->key); + b->hash_val = btree_ptr_hash_val(&b->key); int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash, bch_btree_cache_params); if (ret) @@ -270,10 +292,8 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) bool p = __btree_node_pinned(bc, b); mod_bit(BTREE_NODE_pinned, &b->flags, p); - list_move_tail(&b->list, &bc->live[p].list); + list_add_tail(&b->list, &bc->live[p].list); bc->live[p].nr++; - - bc->nr_freeable--; return 0; } @@ -485,7 +505,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, goto out; if (!btree_node_reclaim(c, b, true)) { - btree_node_data_free(c, b); + btree_node_data_free(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); freed++; @@ -501,10 +521,10 @@ restart: bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; --touched;; } else if (!btree_node_reclaim(c, b, true)) { - bch2_btree_node_hash_remove(bc, b); + __bch2_btree_node_hash_remove(bc, b); + __btree_node_data_free(bc, b); freed++; - btree_node_data_free(c, b); bc->nr_freed++; six_unlock_write(&b->c.lock); @@ -587,7 +607,7 @@ void 
bch2_fs_btree_cache_exit(struct bch_fs *c) BUG_ON(btree_node_read_in_flight(b) || btree_node_write_in_flight(b)); - btree_node_data_free(c, b); + btree_node_data_free(bc, b); } BUG_ON(!bch2_journal_error(&c->journal) && @@ -786,8 +806,8 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); -got_node: +got_node: /* * btree_free() doesn't free memory; it sticks the node on the end of * the list. Check if there's any freed nodes there: @@ -796,7 +816,12 @@ got_node: if (!btree_node_reclaim(c, b2, false)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); + + list_del_init(&b2->list); + --bc->nr_freeable; btree_node_to_freedlist(bc, b2); + mutex_unlock(&bc->lock); + six_unlock_write(&b2->c.lock); six_unlock_intent(&b2->c.lock); goto got_mem; @@ -810,11 +835,8 @@ got_node: goto err; } - mutex_lock(&bc->lock); - bc->nr_freeable++; got_mem: - mutex_unlock(&bc->lock); - + BUG_ON(!list_empty(&b->list)); BUG_ON(btree_node_hashed(b)); BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_write_in_flight(b)); @@ -845,7 +867,7 @@ err: if (bc->alloc_lock == current) { b2 = btree_node_cannibalize(c); clear_btree_node_just_written(b2); - bch2_btree_node_hash_remove(bc, b2); + __bch2_btree_node_hash_remove(bc, b2); if (b) { swap(b->data, b2->data); @@ -855,9 +877,9 @@ err: six_unlock_intent(&b2->c.lock); } else { b = b2; - list_del_init(&b->list); } + BUG_ON(!list_empty(&b->list)); mutex_unlock(&bc->lock); trace_and_count(c, btree_cache_cannibalize, trans); @@ -936,7 +958,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, b->hash_val = 0; mutex_lock(&bc->lock); - list_add(&b->list, &bc->freeable); + __bch2_btree_node_to_freelist(bc, b); mutex_unlock(&bc->lock); six_unlock_write(&b->c.lock); @@ -1312,9 +1334,12 @@ int bch2_btree_node_prefetch(struct btree_trans *trans, b = bch2_btree_node_fill(trans, path, k, btree_id, level, SIX_LOCK_read, 
false); - if (!IS_ERR_OR_NULL(b)) + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret; + if (b) six_unlock_read(&b->c.lock); - return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b); + return 0; } void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) @@ -1353,7 +1378,7 @@ wait_on_io: mutex_lock(&bc->lock); bch2_btree_node_hash_remove(bc, b); - btree_node_data_free(c, b); + btree_node_data_free(bc, b); mutex_unlock(&bc->lock); out: six_unlock_write(&b->c.lock); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 367acd217c6a..66e86d1a178d 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -14,7 +14,9 @@ void bch2_recalc_btree_reserve(struct bch_fs *); void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *); +void __bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); + int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, unsigned, enum btree_id); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 771154e3a291..81dcf9e512c0 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -182,7 +182,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) bch2_btree_node_drop_keys_outside_node(b); mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); + __bch2_btree_node_hash_remove(&c->btree_cache, b); bkey_copy(&b->key, &new->k_i); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); @@ -820,12 +820,22 @@ static int bch2_alloc_write_key(struct btree_trans *trans, * fix that here: */ alloc_data_type_set(&gc, gc.data_type); - if (gc.data_type != old_gc.data_type || gc.dirty_sectors != old_gc.dirty_sectors) { ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc); if (ret) return ret; + + /* + * Ugly: 
alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not + * safe w.r.t. transaction restarts, so fixup the gc_bucket so + * we don't run it twice: + */ + percpu_down_read(&c->mark_lock); + struct bucket *gc_m = gc_bucket(ca, iter->pos.offset); + gc_m->data_type = gc.data_type; + gc_m->dirty_sectors = gc.dirty_sectors; + percpu_up_read(&c->mark_lock); } if (fsck_err_on(new.data_type != gc.data_type, @@ -1224,17 +1234,20 @@ int bch2_gc_gens(struct bch_fs *c) u64 b, start_time = local_clock(); int ret; - /* - * Ideally we would be using state_lock and not gc_gens_lock here, but that - * introduces a deadlock in the RO path - we currently take the state - * lock at the start of going RO, thus the gc thread may get stuck: - */ if (!mutex_trylock(&c->gc_gens_lock)) return 0; trace_and_count(c, gc_gens_start, c); - down_read(&c->state_lock); + /* + * We have to use trylock here. Otherwise, we would + * introduce a deadlock in the RO path - we take the + * state lock at the start of going RO. + */ + if (!down_read_trylock(&c->state_lock)) { + mutex_unlock(&c->gc_gens_lock); + return 0; + } for_each_member_device(c, ca) { struct bucket_gens *gens = bucket_gens(ca); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 1c1448b52207..839d68802e42 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -733,11 +733,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, c, ca, b, i, NULL, bset_past_end_of_btree_node, "bset past end of btree node (offset %u len %u but written %zu)", - offset, sectors, ptr_written ?: btree_sectors(c))) { + offset, sectors, ptr_written ?: btree_sectors(c))) i->u64s = 0; - ret = 0; - goto out; - } btree_err_on(offset && !i->u64s, -BCH_ERR_btree_node_read_err_fixable, @@ -829,7 +826,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BSET_BIG_ENDIAN(i), write, &bn->format); } -out: fsck_err: printbuf_exit(&buf2); printbuf_exit(&buf1); @@ -1838,10 +1834,11 @@ static void 
btree_node_write_done(struct bch_fs *c, struct btree *b) struct btree_trans *trans = bch2_trans_get(c); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - __btree_node_write_done(c, b); - six_unlock_read(&b->c.lock); + /* we don't need transaction context anymore after we got the lock. */ bch2_trans_put(trans); + __btree_node_write_done(c, b); + six_unlock_read(&b->c.lock); } static void btree_node_write_work(struct work_struct *work) @@ -1870,7 +1867,7 @@ static void btree_node_write_work(struct work_struct *work) } } else { - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_do(c, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_journal_reclaim| diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bfe9f0c1e1be..eef9b89c561d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -882,6 +882,18 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos); k = bch2_btree_and_journal_iter_peek(&jiter); + if (!k.k) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "node not found at pos "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, " at btree "); + bch2_btree_pos_to_text(&buf, c, l->b); + + ret = bch2_fs_topology_error(c, "%s", buf.buf); + printbuf_exit(&buf); + goto err; + } bch2_bkey_buf_reassemble(out, c, k); @@ -889,6 +901,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, c->opts.btree_node_prefetch) ret = btree_path_prefetch_j(trans, path, &jiter); +err: bch2_btree_and_journal_iter_exit(&jiter); return ret; } @@ -2381,9 +2394,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e else iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); - if (unlikely(!(iter->flags & BTREE_ITER_is_extents) - ? 
bkey_gt(iter_pos, end) - : bkey_ge(iter_pos, end))) + if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) : + iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) : + bkey_gt(iter_pos, end))) goto end; break; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 78e63ad7d380..0bda054f80d7 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -857,6 +857,14 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\ SPOS_MAX, _flags, _k, _ret) +#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_rewind(&(_iter))) + #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) @@ -904,6 +912,8 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); _ret; \ }) +#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do)) + struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); void bch2_trans_put(struct btree_trans *); diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 1e694fedc5da..30131c3bdd97 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -171,6 +171,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) return; + if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX) + return; + rcu_read_lock(); struct found_btree_node n = { .btree_id = BTREE_NODE_ID(bn), @@ -183,7 +186,7 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, .ptrs[0].type = 1 << 
BCH_EXTENT_ENTRY_ptr, .ptrs[0].offset = offset, .ptrs[0].dev = ca->dev_idx, - .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)), + .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)), }; rcu_read_unlock(); diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 514df618548e..5d809e8bd170 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -668,7 +668,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, struct disk_reservation *disk_res, int flags, enum btree_iter_update_trigger_flags iter_flags) { - return bch2_trans_do(c, disk_res, NULL, flags, + return bch2_trans_commit_do(c, disk_res, NULL, flags, bch2_btree_insert_trans(trans, id, k, iter_flags)); } @@ -865,7 +865,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, memcpy(l->d, buf.buf, buf.pos); c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { - ret = bch2_trans_do(c, NULL, NULL, + ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|commit_flags, __bch2_trans_log_msg(trans, &buf, u64s)); } diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 6a454f2fa005..70b3c989fac2 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -192,7 +192,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans, nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_flags))) -#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ +#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \ bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) #define trans_for_each_update(_trans, _i) \ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 190bc1e81756..d596ef93239f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -237,10 +237,6 @@ static void 
__btree_node_free(struct btree_trans *trans, struct btree *b) BUG_ON(b->will_make_reachable); clear_btree_node_noevict(b); - - mutex_lock(&c->btree_cache.lock); - list_move(&b->list, &c->btree_cache.freeable); - mutex_unlock(&c->btree_cache.lock); } static void bch2_btree_node_free_inmem(struct btree_trans *trans, @@ -252,12 +248,12 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, bch2_btree_node_lock_write_nofail(trans, path, &b->c); + __btree_node_free(trans, b); + mutex_lock(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); - __btree_node_free(trans, b); - six_unlock_write(&b->c.lock); mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); @@ -289,7 +285,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, clear_btree_node_need_write(b); mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); + __bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); BUG_ON(p->nr >= ARRAY_SIZE(p->b)); @@ -521,8 +517,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); __btree_node_free(trans, b); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); + bch2_btree_node_to_freelist(c, b); } } } @@ -1434,6 +1429,15 @@ bch2_btree_insert_keys_interior(struct btree_update *as, } } +static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) +{ + if (insert_keys) + for_each_keylist_key(insert_keys, k) + if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos)) + return true; + return false; +} + /* * Move keys from n1 (original replacement node, now lower node) to n2 (higher * node) @@ -1441,7 +1445,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, static void __btree_split_node(struct btree_update *as, struct btree_trans *trans, 
struct btree *b, - struct btree *n[2]) + struct btree *n[2], + struct keylist *insert_keys) { struct bkey_packed *k; struct bpos n1_pos = POS_MIN; @@ -1476,7 +1481,8 @@ static void __btree_split_node(struct btree_update *as, if (b->c.level && u64s < n1_u64s && u64s + k->u64s >= n1_u64s && - bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p)) + (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) || + key_deleted_in_insert(insert_keys, uk.p))) n1_u64s += k->u64s; i = u64s >= n1_u64s; @@ -1603,7 +1609,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); - __btree_split_node(as, trans, b, n); + __btree_split_node(as, trans, b, n, keys); if (keys) { btree_split_insert_keys(as, trans, path, n1, keys); @@ -2239,10 +2245,8 @@ static void async_btree_node_rewrite_work(struct work_struct *work) struct async_btree_rewrite *a = container_of(work, struct async_btree_rewrite, work); struct bch_fs *c = a->c; - int ret; - ret = bch2_trans_do(c, NULL, NULL, 0, - async_btree_node_rewrite_trans(trans, a)); + int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a)); bch_err_fn_ratelimited(c, ret); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); @@ -2394,7 +2398,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, if (new_hash) { mutex_lock(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, new_hash); - bch2_btree_node_hash_remove(&c->btree_cache, b); + + __bch2_btree_node_hash_remove(&c->btree_cache, b); bkey_copy(&b->key, new_key); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 3f56b584f8ec..1639c60dffa0 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -277,6 +277,10 @@ static int 
bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); int ret = 0; + ret = bch2_journal_error(&c->journal); + if (ret) + return ret; + bch2_trans_unlock(trans); bch2_trans_begin(trans); @@ -491,7 +495,8 @@ static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) return ret; } -static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) +static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, + bool *did_work) { struct bch_fs *c = trans->c; struct btree_write_buffer *wb = &c->btree_write_buffer; @@ -502,6 +507,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); + *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr; + /* * On memory allocation failure, bch2_btree_write_buffer_flush_locked() * is not guaranteed to empty wb->inc: @@ -521,17 +528,34 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j, struct journal_entry_pin *_pin, u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool did_work = false; - return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq)); + return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work)); } int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) { struct bch_fs *c = trans->c; + bool did_work = false; trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); - return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal)); + return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work); +} + +/* + * The write buffer requires flushing when going RO: keys in the journal for the + * write buffer don't have a journal pin yet + */ +bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c) +{ + if (bch2_journal_error(&c->journal)) + return false; + + bool did_work = false; + bch2_trans_run(c, 
btree_write_buffer_flush_seq(trans, + journal_cur_seq(&c->journal), &did_work)); + return did_work; } int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index 725e79654216..d535cea28bde 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -21,6 +21,7 @@ static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c) struct btree_trans; int bch2_btree_write_buffer_flush_sync(struct btree_trans *); +bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *); int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); int bch2_btree_write_buffer_tryflush(struct btree_trans *); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 546cd01a72e3..ec7d9a59bea9 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1160,11 +1160,11 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c) #define SECTORS_CACHE 1024 int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - u64 sectors, int flags) + u64 sectors, enum bch_reservation_flags flags) { struct bch_fs_pcpu *pcpu; u64 old, get; - s64 sectors_available; + u64 sectors_available; int ret; percpu_down_read(&c->mark_lock); @@ -1202,6 +1202,9 @@ recalculate: percpu_u64_set(&c->pcpu->sectors_available, 0); sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); + if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL)) + sectors = min(sectors, sectors_available); + if (sectors <= sectors_available || (flags & BCH_DISK_RESERVATION_NOFAIL)) { atomic64_set(&c->sectors_available, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index e2cb7b24b220..ccc78bfe2fd4 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -103,12 +103,18 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) return gens->b + b; } -static inline u8 bucket_gen_get(struct bch_dev *ca, size_t b) +static inline int 
bucket_gen_get_rcu(struct bch_dev *ca, size_t b) +{ + u8 *gen = bucket_gen(ca, b); + return gen ? *gen : -1; +} + +static inline int bucket_gen_get(struct bch_dev *ca, size_t b) { rcu_read_lock(); - u8 gen = *bucket_gen(ca, b); + int ret = bucket_gen_get_rcu(ca, b); rcu_read_unlock(); - return gen; + return ret; } static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, @@ -169,10 +175,8 @@ static inline int gen_after(u8 a, u8 b) static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - u8 *gen = bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)); - if (!gen) - return -1; - return gen_after(*gen, ptr->gen); + int gen = bucket_gen_get_rcu(ca, PTR_BUCKET_NR(ca, ptr)); + return gen < 0 ? gen : gen_after(gen, ptr->gen); } /** @@ -184,7 +188,6 @@ static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr rcu_read_lock(); int ret = dev_ptr_stale_rcu(ca, ptr); rcu_read_unlock(); - return ret; } @@ -344,14 +347,16 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c, } } -#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) +enum bch_reservation_flags { + BCH_DISK_RESERVATION_NOFAIL = 1 << 0, + BCH_DISK_RESERVATION_PARTIAL = 1 << 1, +}; -int __bch2_disk_reservation_add(struct bch_fs *, - struct disk_reservation *, - u64, int); +int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *, + u64, enum bch_reservation_flags); static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - u64 sectors, int flags) + u64 sectors, enum bch_reservation_flags flags) { #ifdef __KERNEL__ u64 old, new; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index cbfd88f98472..2182b555c112 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -225,6 +225,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); opt_set(thr->opts, read_only, 1); + opt_set(thr->opts, ratelimit_errors, 
0); /* We need request_key() to be called before we punt to kthread: */ opt_set(thr->opts, nostart, true); diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c index 4f06cd8bbbe1..e86d36d23e9e 100644 --- a/fs/bcachefs/darray.c +++ b/fs/bcachefs/darray.c @@ -2,6 +2,7 @@ #include <linux/log2.h> #include <linux/slab.h> +#include <linux/vmalloc.h> #include "darray.h" int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) @@ -9,7 +10,19 @@ int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_ if (new_size > d->size) { new_size = roundup_pow_of_two(new_size); - void *data = kvmalloc_array_noprof(new_size, element_size, gfp); + /* + * This is a workaround: kvmalloc() doesn't support > INT_MAX + * allocations, but vmalloc() does. + * The limit needs to be lifted from kvmalloc, and when it does + * we'll go back to just using that. + */ + size_t bytes; + if (unlikely(check_mul_overflow(new_size, element_size, &bytes))) + return -ENOMEM; + + void *data = likely(bytes < INT_MAX) + ? 
kvmalloc_noprof(bytes, gfp) + : vmalloc_noprof(bytes); if (!data) return -ENOMEM; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 462b1a2fe1ad..8e75a852b358 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -80,6 +80,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc if (ptr2 == ptr) break; + ca = bch2_dev_have_ref(c, ptr2->dev); bucket = PTR_BUCKET_POS(ca, ptr2); bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); } @@ -235,7 +236,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (((1U << i) & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) { - bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); + bch2_extent_ptr_set_cached(c, &m->op.opts, + bkey_i_to_s(insert), ptr); rewrites_found |= 1U << i; } i++; @@ -283,7 +285,8 @@ restart_drop_extra_replicas: durability - ptr_durability >= m->op.opts.data_replicas) { durability -= ptr_durability; - bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); + bch2_extent_ptr_set_cached(c, &m->op.opts, + bkey_i_to_s(insert), &entry->ptr); goto restart_drop_extra_replicas; } } @@ -294,7 +297,7 @@ restart_drop_extra_replicas: bch2_extent_ptr_decoded_append(insert, &p); bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); - bch2_extent_normalize(c, bkey_i_to_s(insert)); + bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert)); ret = bch2_sum_sector_overwrites(trans, &iter, insert, &should_check_enospc, @@ -557,7 +560,8 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) int bch2_extent_drop_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - struct data_update_opts data_opts) + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { struct bch_fs *c = trans->c; struct bkey_i *n; @@ -568,11 +572,11 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, if (ret) 
return ret; - while (data_opts.kill_ptrs) { - unsigned i = 0, drop = __fls(data_opts.kill_ptrs); + while (data_opts->kill_ptrs) { + unsigned i = 0, drop = __fls(data_opts->kill_ptrs); bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop); - data_opts.kill_ptrs ^= 1U << drop; + data_opts->kill_ptrs ^= 1U << drop; } /* @@ -580,7 +584,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, * will do the appropriate thing with it (turning it into a * KEY_TYPE_error key, or just a discard if it was a cached extent) */ - bch2_extent_normalize(c, bkey_i_to_s(n)); + bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n)); /* * Since we're not inserting through an extent iterator @@ -719,7 +723,7 @@ int bch2_data_update_init(struct btree_trans *trans, m->data_opts.rewrite_ptrs = 0; /* if iter == NULL, it's just a promote */ if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); + ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); goto out; } diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 8d36365bdea8..e4b50723428e 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -40,7 +40,8 @@ void bch2_data_update_read_done(struct data_update *, int bch2_extent_drop_ptrs(struct btree_trans *, struct btree_iter *, struct bkey_s_c, - struct data_update_opts); + struct bch_io_opts *, + struct data_update_opts *); void bch2_data_update_exit(struct data_update *); int bch2_data_update_init(struct btree_trans *, struct btree_iter *, diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 84dd4a879d98..faffc98d5605 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -250,13 +250,6 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, return ret; } -static void dirent_copy_target(struct bkey_i_dirent *dst, - struct bkey_s_c_dirent src) -{ - dst->v.d_inum = src.v->d_inum; - dst->v.d_type = src.v->d_type; -} - int bch2_dirent_read_target(struct btree_trans *trans, 
subvol_inum dir, struct bkey_s_c_dirent d, subvol_inum *target) { diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 8945145865c5..53ad99666022 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -34,6 +34,13 @@ static inline unsigned dirent_val_u64s(unsigned len) int bch2_dirent_read_target(struct btree_trans *, subvol_inum, struct bkey_s_c_dirent, subvol_inum *); +static inline void dirent_copy_target(struct bkey_i_dirent *dst, + struct bkey_s_c_dirent src) +{ + dst->v.d_inum = src.v->d_inum; + dst->v.d_type = src.v->d_type; +} + int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 9f3133e3e7e5..07eb8fa1b026 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -242,6 +242,14 @@ void bch2_accounting_swab(struct bkey_s k) *p = swab64(*p); } +static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, + struct disk_accounting_pos acc) +{ + unsafe_memcpy(r, &acc.replicas, + replicas_entry_bytes(&acc.replicas), + "variable length struct"); +} + static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p) { struct disk_accounting_pos acc_k; @@ -249,9 +257,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc switch (acc_k.type) { case BCH_DISK_ACCOUNTING_replicas: - unsafe_memcpy(r, &acc_k.replicas, - replicas_entry_bytes(&acc_k.replicas), - "variable length struct"); + __accounting_to_replicas(r, acc_k); return true; default: return false; @@ -608,6 +614,81 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) return ret; } +static int bch2_disk_accounting_validate_late(struct btree_trans *trans, + struct disk_accounting_pos acc, + u64 *v, unsigned nr) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0, invalid_dev = -1; + 
+ switch (acc.type) { + case BCH_DISK_ACCOUNTING_replicas: { + struct bch_replicas_padded r; + __accounting_to_replicas(&r.e, acc); + + for (unsigned i = 0; i < r.e.nr_devs; i++) + if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_dev_exists(c, r.e.devs[i])) { + invalid_dev = r.e.devs[i]; + goto invalid_device; + } + + /* + * All replicas entry checks except for invalid device are done + * in bch2_accounting_validate + */ + BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf)); + + if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), + trans, accounting_replicas_not_marked, + "accounting not marked in superblock replicas\n %s", + (printbuf_reset(&buf), + bch2_accounting_key_to_text(&buf, &acc), + buf.buf))) { + /* + * We're not RW yet and still single threaded, dropping + * and retaking lock is ok: + */ + percpu_up_write(&c->mark_lock); + ret = bch2_mark_replicas(c, &r.e); + if (ret) + goto fsck_err; + percpu_down_write(&c->mark_lock); + } + break; + } + + case BCH_DISK_ACCOUNTING_dev_data_type: + if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { + invalid_dev = acc.dev_data_type.dev; + goto invalid_device; + } + break; + } + +fsck_err: + printbuf_exit(&buf); + return ret; +invalid_device: + if (fsck_err(trans, accounting_to_invalid_device, + "accounting entry points to invalid device %i\n %s", + invalid_dev, + (printbuf_reset(&buf), + bch2_accounting_key_to_text(&buf, &acc), + buf.buf))) { + for (unsigned i = 0; i < nr; i++) + v[i] = -v[i]; + + ret = commit_do(trans, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: + -BCH_ERR_remove_disk_accounting_entry; + } else { + ret = -BCH_ERR_remove_disk_accounting_entry; + } + goto fsck_err; +} + /* * At startup time, initialize the in memory accounting from the btree (and * journal) @@ -666,44 +747,42 @@ int bch2_accounting_read(struct bch_fs *c) } keys->gap = keys->nr = dst - keys->data; - percpu_down_read(&c->mark_lock); - for (unsigned i = 0; i < acc->k.nr; i++) { - u64 
v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); + percpu_down_write(&c->mark_lock); + unsigned i = 0; + while (i < acc->k.nr) { + unsigned idx = inorder_to_eytzinger0(i, acc->k.nr); - if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters)) - continue; + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos); - struct bch_replicas_padded r; - if (!accounting_to_replicas(&r.e, acc->k.data[i].pos)) - continue; + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false); /* - * If the replicas entry is invalid it'll get cleaned up by - * check_allocations: + * If the entry counters are zeroed, it should be treated as + * nonexistent - it might point to an invalid device. + * + * Remove it, so that if it's re-added it gets re-marked in the + * superblock: */ - if (bch2_replicas_entry_validate(&r.e, c, &buf)) + ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters) + ? 
-BCH_ERR_remove_disk_accounting_entry + : bch2_disk_accounting_validate_late(trans, acc_k, + v, acc->k.data[idx].nr_counters); + + if (ret == -BCH_ERR_remove_disk_accounting_entry) { + free_percpu(acc->k.data[idx].v[0]); + free_percpu(acc->k.data[idx].v[1]); + darray_remove_item(&acc->k, &acc->k.data[idx]); + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + ret = 0; continue; - - struct disk_accounting_pos k; - bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); - - if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), - trans, accounting_replicas_not_marked, - "accounting not marked in superblock replicas\n %s", - (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, &k), - buf.buf))) { - /* - * We're not RW yet and still single threaded, dropping - * and retaking lock is ok: - */ - percpu_up_read(&c->mark_lock); - ret = bch2_mark_replicas(c, &r.e); - if (ret) - goto fsck_err; - percpu_down_read(&c->mark_lock); } + + if (ret) + goto fsck_err; + i++; } preempt_disable(); @@ -742,7 +821,7 @@ int bch2_accounting_read(struct bch_fs *c) } preempt_enable(); fsck_err: - percpu_up_read(&c->mark_lock); + percpu_up_write(&c->mark_lock); err: printbuf_exit(&buf); bch2_trans_put(trans); @@ -777,8 +856,10 @@ int bch2_dev_usage_init(struct bch_dev *ca, bool gc) }; u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; - int ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc)); + int ret = bch2_trans_do(c, ({ + bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?: + (!gc ? 
bch2_trans_commit(trans, NULL, NULL, 0) : 0); + })); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 1587c6e1866a..749dcf368841 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -124,6 +124,11 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, "incorrect value size (%zu < %u)", bkey_val_u64s(k.k), stripe_val_u64s(s)); + bkey_fsck_err_on(s->csum_granularity_bits >= 64, + c, stripe_csum_granularity_bad, + "invalid csum granularity (%u >= 64)", + s->csum_granularity_bits); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; @@ -145,7 +150,11 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, nr_data, s.nr_redundant); bch2_prt_csum_type(out, s.csum_type); - prt_printf(out, " gran %u", 1U << s.csum_granularity_bits); + prt_str(out, " gran "); + if (s.csum_granularity_bits < 64) + prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits); + else + prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits); if (s.disk_label) { prt_str(out, " label"); @@ -257,12 +266,12 @@ static int __mark_stripe_bucket(struct btree_trans *trans, if (!deleting) { a->stripe = s.k->p.offset; a->stripe_redundancy = s.v->nr_redundant; + alloc_data_type_set(a, data_type); } else { a->stripe = 0; a->stripe_redundancy = 0; + alloc_data_type_set(a, BCH_DATA_user); } - - alloc_data_type_set(a, data_type); err: printbuf_exit(&buf); return ret; @@ -1177,7 +1186,7 @@ static void ec_stripe_delete_work(struct work_struct *work) if (!idx) break; - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ec_stripe_delete(trans, idx)); bch_err_fn(c, ret); if (ret) @@ -1197,47 +1206,62 @@ void bch2_do_stripe_deletes(struct bch_fs *c) /* stripe creation: */ static int ec_stripe_key_update(struct btree_trans *trans, - struct bkey_i_stripe *new, - bool create) + struct bkey_i_stripe *old, + struct bkey_i_stripe *new) { struct 
bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; + bool create = !old; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - new->k.p, BTREE_ITER_intent); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, + new->k.p, BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) goto err; - if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) { - bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s", - create ? "creating" : "updating", - bch2_bkey_types[k.k->type]); + if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe), + c, "error %s stripe: got existing key type %s", + create ? "creating" : "updating", + bch2_bkey_types[k.k->type])) { ret = -EINVAL; goto err; } if (k.k->type == KEY_TYPE_stripe) { - const struct bch_stripe *old = bkey_s_c_to_stripe(k).v; - unsigned i; + const struct bch_stripe *v = bkey_s_c_to_stripe(k).v; - if (old->nr_blocks != new->v.nr_blocks) { - bch_err(c, "error updating stripe: nr_blocks does not match"); - ret = -EINVAL; - goto err; - } + BUG_ON(old->v.nr_blocks != new->v.nr_blocks); + BUG_ON(old->v.nr_blocks != v->nr_blocks); - for (i = 0; i < new->v.nr_blocks; i++) { - unsigned v = stripe_blockcount_get(old, i); + for (unsigned i = 0; i < new->v.nr_blocks; i++) { + unsigned sectors = stripe_blockcount_get(v, i); - BUG_ON(v && - (old->ptrs[i].dev != new->v.ptrs[i].dev || - old->ptrs[i].gen != new->v.ptrs[i].gen || - old->ptrs[i].offset != new->v.ptrs[i].offset)); + if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) { + struct printbuf buf = PRINTBUF; - stripe_blockcount_set(&new->v, i, v); + prt_printf(&buf, "stripe changed nonempty block %u", i); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i)); + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); + ret 
= -EINVAL; + goto err; + } + + /* + * If the stripe ptr changed underneath us, it must have + * been dev_remove_stripes() -> * invalidate_stripe_to_dev() + */ + if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) { + BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID); + + if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i])) + new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID; + } + + stripe_blockcount_set(&new->v, i, sectors); } } @@ -1495,12 +1519,14 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err; } - ret = bch2_trans_do(c, &s->res, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc, - ec_stripe_key_update(trans, - bkey_i_to_stripe(&s->new_stripe.key), - !s->have_existing_stripe)); + ret = bch2_trans_commit_do(c, &s->res, NULL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, + ec_stripe_key_update(trans, + s->have_existing_stripe + ? bkey_i_to_stripe(&s->existing_stripe.key) + : NULL, + bkey_i_to_stripe(&s->new_stripe.key))); bch_err_msg(c, ret, "creating stripe key"); if (ret) { goto err; @@ -1844,6 +1870,10 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, } h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark); + if (!h) { + h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc); + goto err; + } found: if (h->rw_devs_change_count != c->rw_devs_change_count) ec_stripe_head_devs_update(c, h); @@ -1876,7 +1906,15 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { - __clear_bit(v->ptrs[i].dev, devs.d); + /* + * Note: we don't yet repair invalid blocks (failed/removed + * devices) when reusing stripes - we still need a codepath to + * walk backpointers and update all extents that point to that + * block when updating the stripe + */ + if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) + __clear_bit(v->ptrs[i].dev, devs.d); + if (i < h->s->nr_data) 
nr_have_data++; else diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 60b7875adada..9c4fe5cdbfb7 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -83,6 +83,8 @@ x(ENOMEM, ENOMEM_fs_other_alloc) \ x(ENOMEM, ENOMEM_dev_alloc) \ x(ENOMEM, ENOMEM_disk_accounting) \ + x(ENOMEM, ENOMEM_stripe_head_alloc) \ + x(ENOMEM, ENOMEM_journal_read_bucket) \ x(ENOSPC, ENOSPC_disk_reservation) \ x(ENOSPC, ENOSPC_bucket_alloc) \ x(ENOSPC, ENOSPC_disk_label_add) \ @@ -222,6 +224,7 @@ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \ x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \ x(BCH_ERR_invalid_sb, invalid_sb_members) \ x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ @@ -268,7 +271,8 @@ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ x(0, invalid_snapshot_node) \ - x(0, option_needs_open_fs) + x(0, option_needs_open_fs) \ + x(0, remove_disk_accounting_entry) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 7a79f695ba2e..b679def8fb98 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -251,7 +251,10 @@ int __bch2_fsck_err(struct bch_fs *c, * delete the key) * - and we don't need to warn if we're not prompting */ - WARN_ON(!(flags & FSCK_AUTOFIX) && !trans && bch2_current_has_btree_trans(c)); + WARN_ON((flags & FSCK_CAN_FIX) && + !(flags & FSCK_AUTOFIX) && + !trans && + bch2_current_has_btree_trans(c)); if ((flags & FSCK_CAN_FIX) && test_bit(err, c->sb.errors_silent)) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index cc0d22085aef..37e3d69bec06 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -978,31 +978,54 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke return NULL; } -void 
bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) +static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, + struct bch_extent_ptr *ptr) +{ + if (!opts->promote_target || + !bch2_dev_in_target(c, ptr->dev, opts->promote_target)) + return false; + + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); + + return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr); +} + +void bch2_extent_ptr_set_cached(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s k, + struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; - union bch_extent_entry *ec = NULL; + struct extent_ptr_decoded p; - bkey_extent_entry_for_each(ptrs, entry) { + rcu_read_lock(); + if (!want_cached_ptr(c, opts, ptr)) { + bch2_bkey_drop_ptr_noerror(k, ptr); + goto out; + } + + /* + * Stripes can't contain cached data, for - reasons. + * + * Possibly something we can fix in the future? + */ + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (&entry->ptr == ptr) { - ptr->cached = true; - if (ec) - extent_entry_drop(k, ec); - return; + if (p.has_ec) + bch2_bkey_drop_ptr_noerror(k, ptr); + else + ptr->cached = true; + goto out; } - if (extent_entry_is_stripe_ptr(entry)) - ec = entry; - else if (extent_entry_is_ptr(entry)) - ec = NULL; - } - BUG(); +out: + rcu_read_unlock(); } /* - * bch_extent_normalize - clean up an extent, dropping stale pointers etc. + * bch2_extent_normalize - clean up an extent, dropping stale pointers etc. 
* * Returns true if @k should be dropped entirely * @@ -1016,8 +1039,39 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) rcu_read_lock(); bch2_bkey_drop_ptrs(k, ptr, ptr->cached && - (ca = bch2_dev_rcu(c, ptr->dev)) && - dev_ptr_stale_rcu(ca, ptr) > 0); + (!(ca = bch2_dev_rcu(c, ptr->dev)) || + dev_ptr_stale_rcu(ca, ptr) > 0)); + rcu_read_unlock(); + + return bkey_deleted(k.k); +} + +/* + * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc. + * + * Like bch2_extent_normalize(), but also only keeps a single cached pointer on + * the promote target. + */ +bool bch2_extent_normalize_by_opts(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s k) +{ + struct bkey_ptrs ptrs; + bool have_cached_ptr; + + rcu_read_lock(); +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(k); + have_cached_ptr = false; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->cached) { + if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) { + bch2_bkey_drop_ptr(k, ptr); + goto restart_drop_ptrs; + } + have_cached_ptr = true; + } rcu_read_unlock(); return bkey_deleted(k.k); @@ -1310,7 +1364,7 @@ void bch2_ptr_swab(struct bkey_s k) for (entry = ptrs.start; entry < ptrs.end; entry = extent_entry_next(entry)) { - switch (extent_entry_type(entry)) { + switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: break; case BCH_EXTENT_ENTRY_crc32: @@ -1330,6 +1384,9 @@ void bch2_ptr_swab(struct bkey_s k) break; case BCH_EXTENT_ENTRY_rebalance: break; + default: + /* Bad entry type: will be caught by validate() */ + return; } } } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index ed5001dd662e..bcffcf60aaaf 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -686,15 +686,28 @@ bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); struct bch_extent_ptr * bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); -void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); +void 
bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *, + struct bkey_s, struct bch_extent_ptr *); +bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); + void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, + struct bch_extent_ptr ptr2) +{ + return (ptr1.cached == ptr2.cached && + ptr1.unwritten == ptr2.unwritten && + ptr1.offset == ptr2.offset && + ptr1.dev == ptr2.dev && + ptr1.dev == ptr2.dev); +} + void bch2_ptr_swab(struct bkey_s); const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 48a1ab9a649b..95972809e76d 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -856,6 +856,12 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, folios_trunc(&fs, fi); end = min(end, folio_end_pos(darray_last(fs))); } else { + if (!folio_test_uptodate(f)) { + ret = bch2_read_single_folio(f, mapping); + if (ret) + goto out; + } + folios_trunc(&fs, fi + 1); end = f_pos + f_reserved; } diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index ee1c0325f313..6d3a05ae5da8 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -369,6 +369,7 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio) static __always_inline long bch2_dio_write_done(struct dio_write *dio) { + struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; struct bch_inode_info *inode = dio->inode; bool sync = dio->sync; @@ -387,7 +388,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) ret = dio->op.error ?: 
((long) dio->written << 9); bio_put(&dio->op.wbio.bio); - bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write); + bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); /* inode->i_dio_count is our ref on inode and thus bch_fs */ inode_dio_end(&inode->v); diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index af3a24546aa3..1d4910ea0f1d 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -399,14 +399,17 @@ void bch2_folio_reservation_put(struct bch_fs *c, bch2_quota_reservation_put(c, inode, &res->quota); } -int bch2_folio_reservation_get(struct bch_fs *c, +static int __bch2_folio_reservation_get(struct bch_fs *c, struct bch_inode_info *inode, struct folio *folio, struct bch2_folio_reservation *res, - size_t offset, size_t len) + size_t offset, size_t len, + bool partial) { struct bch_folio *s = bch2_folio_create(folio, 0); unsigned i, disk_sectors = 0, quota_sectors = 0; + struct disk_reservation disk_res = {}; + size_t reserved = len; int ret; if (!s) @@ -422,48 +425,65 @@ int bch2_folio_reservation_get(struct bch_fs *c, } if (disk_sectors) { - ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); + ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors, + partial ? 
BCH_DISK_RESERVATION_PARTIAL : 0); if (unlikely(ret)) return ret; + + if (unlikely(disk_res.sectors != disk_sectors)) { + disk_sectors = quota_sectors = 0; + + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); + if (disk_sectors > disk_res.sectors) { + /* + * Make sure to get a reservation that's + * aligned to the filesystem blocksize: + */ + unsigned reserved_offset = round_down(i << 9, block_bytes(c)); + reserved = clamp(reserved_offset, offset, offset + len) - offset; + + if (!reserved) { + bch2_disk_reservation_put(c, &disk_res); + return -BCH_ERR_ENOSPC_disk_reservation; + } + break; + } + quota_sectors += s->s[i].state == SECTOR_unallocated; + } + } } if (quota_sectors) { ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true); if (unlikely(ret)) { - struct disk_reservation tmp = { .sectors = disk_sectors }; - - bch2_disk_reservation_put(c, &tmp); - res->disk.sectors -= disk_sectors; + bch2_disk_reservation_put(c, &disk_res); return ret; } } - return 0; + res->disk.sectors += disk_res.sectors; + return partial ? 
reserved : 0; } -ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c, +int bch2_folio_reservation_get(struct bch_fs *c, struct bch_inode_info *inode, struct folio *folio, struct bch2_folio_reservation *res, size_t offset, size_t len) { - size_t l, reserved = 0; - int ret; - - while ((l = len - reserved)) { - while ((ret = bch2_folio_reservation_get(c, inode, folio, res, offset, l))) { - if ((offset & (block_bytes(c) - 1)) + l <= block_bytes(c)) - return reserved ?: ret; - - len = reserved + l; - l /= 2; - } - - offset += l; - reserved += l; - } + return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false); +} - return reserved; +ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + size_t offset, size_t len) +{ + return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true); } static void bch2_clear_folio_bits(struct folio *folio) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 71d0fa387509..2456c41b215e 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -182,7 +182,7 @@ static int bch2_flush_inode(struct bch_fs *c, struct bch_inode_unpacked u; int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?: - bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: + bch2_journal_flush_seq(&c->journal, u.bi_journal_seq, TASK_INTERRUPTIBLE) ?: bch2_inode_flush_nocow_writes(c, inode); bch2_write_ref_put(c, BCH_WRITE_REF_fsync); return ret; @@ -587,7 +587,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, POS(inode->v.i_ino, start_sector), BTREE_ITER_slots|BTREE_ITER_intent); - while (!ret && bkey_lt(iter.pos, end_pos)) { + while (!ret) { s64 i_sectors_delta = 0; struct quota_res quota_res = { 0 }; struct bkey_s_c k; @@ -598,6 +598,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_begin(trans); + if (bkey_ge(iter.pos, end_pos)) + 
break; + ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); if (ret) @@ -634,12 +637,15 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if (bch2_clamp_data_hole(&inode->v, &hole_start, &hole_end, - opts.data_replicas, true)) + opts.data_replicas, true)) { ret = drop_locks_do(trans, (bch2_clamp_data_hole(&inode->v, &hole_start, &hole_end, opts.data_replicas, false), 0)); + if (ret) + goto bkey_err; + } bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); if (ret) @@ -667,10 +673,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta); if (bch2_mark_pagecache_reserved(inode, &hole_start, - iter.pos.offset, true)) - drop_locks_do(trans, + iter.pos.offset, true)) { + ret = drop_locks_do(trans, bch2_mark_pagecache_reserved(inode, &hole_start, iter.pos.offset, false)); + if (ret) + goto bkey_err; + } bkey_err: bch2_quota_reservation_put(c, inode, &quota_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 02969dff165d..a41d0d8a2f7b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -157,6 +157,20 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b) return a.subvol == b.subvol && a.inum == b.inum; } +static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) +{ + const subvol_inum *inum = data; + + return jhash(&inum->inum, sizeof(inum->inum), seed); +} + +static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) +{ + const struct bch_inode_info *inode = data; + + return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed); +} + static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -170,11 +184,91 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { .head_offset = offsetof(struct bch_inode_info, hash), .key_offset = offsetof(struct bch_inode_info, 
ei_inum), .key_len = sizeof(subvol_inum), + .hashfn = bch2_vfs_inode_hash_fn, + .obj_hashfn = bch2_vfs_inode_obj_hash_fn, .obj_cmpfn = bch2_vfs_inode_cmp_fn, .automatic_shrinking = true, }; -struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) +int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) +{ + struct bch_fs *c = trans->c; + struct rhashtable *ht = &c->vfs_inodes_table; + subvol_inum inum = (subvol_inum) { .inum = p.offset }; + DARRAY(u32) subvols; + int ret = 0; + + if (!test_bit(BCH_FS_started, &c->flags)) + return false; + + darray_init(&subvols); +restart_from_top: + + /* + * Tweaked version of __rhashtable_lookup(); we need to get a list of + * subvolumes in which the given inode number is open. + * + * For this to work, we don't include the subvolume ID in the key that + * we hash - all inodes with the same inode number regardless of + * subvolume will hash to the same slot. + * + * This will be less than ideal if the same file is ever open + * simultaneously in many different snapshots: + */ + rcu_read_lock(); + struct rhash_lock_head __rcu *const *bkt; + struct rhash_head *he; + unsigned int hash; + struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params); + bkt = rht_bucket(tbl, hash); + do { + struct bch_inode_info *inode; + + rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { + if (inode->ei_inum.inum == inum.inum) { + ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, + GFP_NOWAIT|__GFP_NOWARN); + if (ret) { + rcu_read_unlock(); + ret = darray_make_room(&subvols, 1); + if (ret) + goto err; + subvols.nr = 0; + goto restart_from_top; + } + } + } + /* An object might have been moved to a different hash chain, + * while we walk along it - better check and retry. + */ + } while (he != RHT_NULLS_MARKER(bkt)); + + /* Ensure we see any new tables. 
*/ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + rcu_read_unlock(); + + darray_for_each(subvols, i) { + u32 snap; + ret = bch2_subvolume_get_snapshot(trans, *i, &snap); + if (ret) + goto err; + + ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot); + if (ret) + break; + } +err: + darray_exit(&subvols); + return ret; +} + +static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) { return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); } @@ -184,7 +278,8 @@ static void __wait_on_freeing_inode(struct bch_fs *c, subvol_inum inum) { wait_queue_head_t *wq; - DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); + struct wait_bit_queue_entry wait; + wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->v.i_lock); @@ -252,7 +347,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, set_bit(EI_INODE_HASHED, &inode->ei_flags); retry: - if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table, + if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table, + &inode->ei_inum, &inode->hash, bch2_vfs_inodes_params))) { old = bch2_inode_hash_find(c, trans, inode->ei_inum); @@ -560,7 +656,7 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); struct bch_inode_info *inode; - bch2_trans_do(c, NULL, NULL, 0, + bch2_trans_do(c, PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), &hash, &dentry->d_name))); if (IS_ERR(inode)) @@ -773,7 +869,7 @@ static int bch2_rename2(struct mnt_idmap *idmap, ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); if (ret) - goto err; + goto err_tx_restart; if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { ret = bch2_fs_quota_transfer(c, 
src_inode, @@ -1170,7 +1266,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(ei->v.i_ino, start), 0); - while (true) { + while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { enum btree_id data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -1178,14 +1274,14 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, u32 snapshot; ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); if (ret) - goto err; + continue; bch2_btree_iter_set_snapshot(&iter, snapshot); k = bch2_btree_iter_peek_upto(&iter, end); ret = bkey_err(k); if (ret) - goto err; + continue; if (!k.k) break; @@ -1205,7 +1301,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &cur); if (ret) - break; + continue; k = bkey_i_to_s_c(cur.k); bch2_bkey_buf_realloc(&prev, c, k.k->u64s); @@ -1233,10 +1329,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, iter.pos.offset + sectors)); -err: - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - break; } bch2_trans_iter_exit(trans, &iter); @@ -1944,7 +2036,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb, OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE); printbuf_nul_terminate(&buf); - seq_puts(seq, buf.buf); + seq_printf(seq, ",%s", buf.buf); int ret = buf.allocation_failure ? 
-ENOMEM : 0; printbuf_exit(&buf); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index da74ecc236e7..59f9f7ae728d 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -54,8 +54,6 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode) return inode->ei_inum; } -struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum); - /* * Set if we've gotten a btree error for this inode, and thus the vfs inode and * btree inode may be inconsistent: @@ -148,6 +146,8 @@ struct bch_inode_info * __bch2_create(struct mnt_idmap *, struct bch_inode_info *, struct dentry *, umode_t, dev_t, subvol_inum, unsigned); +int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p); + int bch2_fs_quota_transfer(struct bch_fs *, struct bch_inode_info *, struct bch_qid, @@ -198,10 +198,7 @@ int bch2_vfs_init(void); #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) -static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) -{ - return NULL; -} +static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; } static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b8a6ceb0cc7a..75c8a97a6954 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -326,17 +326,54 @@ err: return ret; } +static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) +{ + if (inode->bi_inum == BCACHEFS_ROOT_INO && + inode->bi_subvol == BCACHEFS_ROOT_SUBVOL) + return false; + + return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); +} + +static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents, + SPOS(d_pos.inode, d_pos.offset, snapshot), + BTREE_ITER_intent| + BTREE_ITER_with_updates); + int 
ret = bkey_err(k); + if (ret) + return ret; + + if (bpos_eq(k.k->p, d_pos)) { + /* + * delet_at() doesn't work because the update path doesn't + * internally use BTREE_ITER_with_updates yet + */ + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto err; + + bkey_init(&k->k); + k->k.type = KEY_TYPE_whiteout; + k->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) { struct bch_fs *c = trans->c; - struct bch_hash_info dir_hash; struct bch_inode_unpacked lostfound; char name_buf[20]; - struct qstr name; - u64 dir_offset = 0; - u32 dirent_snapshot = inode->bi_snapshot; int ret; + u32 dirent_snapshot = inode->bi_snapshot; if (inode->bi_subvol) { inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; @@ -367,9 +404,10 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * if (ret) return ret; - dir_hash = bch2_hash_info_init(c, &lostfound); + struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); + struct qstr name = (struct qstr) QSTR(name_buf); - name = (struct qstr) QSTR(name_buf); + inode->bi_dir = lostfound.bi_inum; ret = bch2_dirent_create_snapshot(trans, inode->bi_parent_subvol, lostfound.bi_inum, @@ -378,17 +416,70 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * inode_d_type(inode), &name, inode->bi_subvol ?: inode->bi_inum, - &dir_offset, + &inode->bi_dir_offset, STR_HASH_must_create); if (ret) { bch_err_msg(c, ret, "error creating dirent"); return ret; } - inode->bi_dir = lostfound.bi_inum; - inode->bi_dir_offset = dir_offset; + ret = __bch2_fsck_write_inode(trans, inode); + if (ret) + return ret; + + /* + * Fix up inodes in child snapshots: if they should also be reattached + * update the backpointer field, if they should not be we need to emit 
+ * whiteouts for the dirent we just created. + */ + if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) { + snapshot_id_list whiteouts_done; + struct btree_iter iter; + struct bkey_s_c k; + + darray_init(&whiteouts_done); + + for_each_btree_key_reverse_norestart(trans, iter, + BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1), + BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) { + if (k.k->p.offset != inode->bi_inum) + break; + + if (!bkey_is_inode(k.k) || + !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) || + snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot)) + continue; + + struct bch_inode_unpacked child_inode; + bch2_inode_unpack(k, &child_inode); - return __bch2_fsck_write_inode(trans, inode); + if (!inode_should_reattach(&child_inode)) { + ret = maybe_delete_dirent(trans, + SPOS(lostfound.bi_inum, inode->bi_dir_offset, + dirent_snapshot), + k.k->p.snapshot); + if (ret) + break; + + ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot); + if (ret) + break; + } else { + iter.snapshot = k.k->p.snapshot; + child_inode.bi_dir = inode->bi_dir; + child_inode.bi_dir_offset = inode->bi_dir_offset; + + ret = bch2_inode_write_flags(trans, &iter, &child_inode, + BTREE_UPDATE_internal_snapshot_node); + if (ret) + break; + } + } + darray_exit(&whiteouts_done); + bch2_trans_iter_exit(trans, &iter); + } + + return ret; } static int remove_backpointer(struct btree_trans *trans, @@ -838,35 +929,138 @@ static int get_visible_inodes(struct btree_trans *trans, return ret; } -static int hash_redo_key(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c k) +static int dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) { - struct bkey_i *delete; - struct bkey_i *tmp; + if (d.v->d_type == DT_SUBVOL) { + u32 snap; + u64 inum; + int ret = subvol_lookup(trans, le32_to_cpu(d.v->d_child_subvol), 
&snap, &inum); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + return !ret; + } else { + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); + int ret = bkey_err(k); + if (ret) + return ret; - delete = bch2_trans_kmalloc(trans, sizeof(*delete)); - if (IS_ERR(delete)) - return PTR_ERR(delete); + ret = bkey_is_inode(k.k); + bch2_trans_iter_exit(trans, &iter); + return ret; + } +} - tmp = bch2_bkey_make_mut_noupdate(trans, k); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); +/* + * Prefer to delete the first one, since that will be the one at the wrong + * offset: + * return value: 0 -> delete k1, 1 -> delete k2 + */ +static int hash_pick_winner(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c k1, + struct bkey_s_c k2) +{ + if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && + !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) + return 0; - bkey_init(&delete->k); - delete->k.p = k_iter->pos; - return bch2_btree_iter_traverse(k_iter) ?: - bch2_trans_update(trans, k_iter, delete, 0) ?: - bch2_hash_set_in_snapshot(trans, desc, hash_info, - (subvol_inum) { 0, k.k->p.inode }, - k.k->p.snapshot, tmp, - STR_HASH_must_create| - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + switch (desc.btree_id) { + case BTREE_ID_dirents: { + int ret = dirent_has_target(trans, bkey_s_c_to_dirent(k1)); + if (ret < 0) + return ret; + if (!ret) + return 0; + + ret = dirent_has_target(trans, bkey_s_c_to_dirent(k2)); + if (ret < 0) + return ret; + if (!ret) + return 1; + return 2; + } + default: + return 0; + } +} + +static int fsck_update_backpointers(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_i *new) +{ + if (new->k.type != KEY_TYPE_dirent) + return 0; + + struct 
bkey_i_dirent *d = bkey_i_to_dirent(new); + struct inode_walker target = inode_walker_init(); + int ret = 0; + + if (d->v.d_type == DT_SUBVOL) { + BUG(); + } else { + ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum)); + if (ret) + goto err; + + darray_for_each(target.inodes, i) { + i->inode.bi_dir_offset = d->k.p.offset; + ret = __bch2_fsck_write_inode(trans, &i->inode); + if (ret) + goto err; + } + } +err: + inode_walker_exit(&target); + return ret; +} + +static int fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c_dirent old) +{ + struct qstr old_name = bch2_dirent_get_name(old); + struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bkey_dirent_init(&new->k_i); + dirent_copy_target(new, old); + new->k.p = old.k->p; + + for (unsigned i = 0; i < 1000; i++) { + unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", + old_name.len, old_name.name, i); + unsigned u64s = BKEY_U64s + dirent_val_u64s(len); + + if (u64s > U8_MAX) + return -EINVAL; + + new->k.u64s = u64s; + + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, + (subvol_inum) { 0, old.k->p.inode }, + old.k->p.snapshot, &new->k_i, + BTREE_UPDATE_internal_snapshot_node); + if (!bch2_err_matches(ret, EEXIST)) + break; + } + + if (ret) + return ret; + + return fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); } static int hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, const struct bch_hash_desc desc, struct bch_hash_info *hash_info, struct btree_iter *k_iter, struct bkey_s_c hash_k) @@ -895,16 +1089,9 @@ static int hash_check_key(struct btree_trans *trans, if (bkey_eq(k.k->p, hash_k.k->p)) break; - if (fsck_err_on(k.k->type == desc.key_type && - !desc.cmp_bkey(k, hash_k), - trans, hash_table_key_duplicate, - "duplicate hash table 
keys:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), - buf.buf))) { - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; - break; - } + if (k.k->type == desc.key_type && + !desc.cmp_bkey(k, hash_k)) + goto duplicate_entries; if (bkey_deleted(k.k)) { bch2_trans_iter_exit(trans, &iter); @@ -917,18 +1104,66 @@ out: return ret; bad_hash: if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", + "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { - ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); - bch_err_fn(c, ret); + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); + if (IS_ERR(new)) + return PTR_ERR(new); + + k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, + (subvol_inum) { 0, hash_k.k->p.inode }, + hash_k.k->p.snapshot, new, + STR_HASH_must_create| + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node); + ret = bkey_err(k); if (ret) - return ret; - ret = -BCH_ERR_transaction_restart_nested; + goto out; + if (k.k) + goto duplicate_entries; + + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, + BTREE_UPDATE_internal_snapshot_node) ?: + fsck_update_backpointers(trans, s, desc, hash_info, new) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + goto out; } fsck_err: goto out; +duplicate_entries: + ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); + if (ret < 0) + goto out; + + if (!fsck_err(trans, hash_table_key_duplicate, + "duplicate hash table keys%s:\n%s", + ret != 2 ? 
"" : ", both point to valid inodes", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) + goto out; + + switch (ret) { + case 0: + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + break; + case 1: + ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); + break; + case 2: + ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: + bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + goto out; + } + + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; } static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, @@ -994,7 +1229,6 @@ static int check_inode_dirent_inode(struct btree_trans *trans, */ inode->bi_dir = 0; inode->bi_dir_offset = 0; - inode->bi_flags &= ~BCH_INODE_backptr_untrusted; *write_inode = true; } @@ -1006,28 +1240,37 @@ fsck_err: return ret; } -static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) +static int get_snapshot_root_inode(struct btree_trans *trans, + struct bch_inode_unpacked *root, + u64 inum) { - subvol_inum inum = { - .subvol = snapshot_t(c, p.snapshot)->subvol, - .inum = p.offset, - }; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; - /* snapshot tree corruption, can't safely delete */ - if (!inum.subvol) { - bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot); - return true; + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, + SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + if (bkey_is_inode(k.k)) + goto found_root; } - - return __bch2_inode_hash_find(c, inum) != NULL; + if (ret) + goto err; + BUG(); +found_root: + BUG_ON(bch2_inode_unpack(k, root)); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; } static int check_inode(struct btree_trans *trans, struct btree_iter 
*iter, struct bkey_s_c k, - struct bch_inode_unpacked *prev, - struct snapshots_seen *s, - bool full) + struct bch_inode_unpacked *snapshot_root, + struct snapshots_seen *s) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; @@ -1050,22 +1293,19 @@ static int check_inode(struct btree_trans *trans, BUG_ON(bch2_inode_unpack(k, &u)); - if (!full && - !(u.bi_flags & (BCH_INODE_i_size_dirty| - BCH_INODE_i_sectors_dirty| - BCH_INODE_unlinked))) - return 0; - - if (prev->bi_inum != u.bi_inum) - *prev = u; + if (snapshot_root->bi_inum != u.bi_inum) { + ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); + if (ret) + goto err; + } - if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || - inode_d_type(prev) != inode_d_type(&u), + if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || + INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), trans, inode_snapshot_mismatch, "inodes in different snapshots don't match")) { - bch_err(c, "repair not implemented yet"); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto err_noprint; + u.bi_hash_seed = snapshot_root->bi_hash_seed; + SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); + do_update = true; } if (u.bi_dir || u.bi_dir_offset) { @@ -1101,28 +1341,27 @@ static int check_inode(struct btree_trans *trans, ret = 0; } - if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && - bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { - struct bpos new_min_pos; - - ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); - if (ret) - goto err; - - u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; - - ret = __bch2_fsck_write_inode(trans, &u); + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret < 0) + goto err; - bch_err_msg(c, ret, "in fsck updating inode"); + if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), + trans, inode_has_child_snapshots_wrong, + "inode has_child_snapshots flag wrong (should be 
%u)\n%s", + ret, + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { if (ret) - goto err_noprint; - - if (!bpos_eq(new_min_pos, POS_MIN)) - bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); - goto err_noprint; + u.bi_flags |= BCH_INODE_has_child_snapshot; + else + u.bi_flags &= ~BCH_INODE_has_child_snapshot; + do_update = true; } + ret = 0; - if (u.bi_flags & BCH_INODE_unlinked) { + if ((u.bi_flags & BCH_INODE_unlinked) && + !(u.bi_flags & BCH_INODE_has_child_snapshot)) { if (!test_bit(BCH_FS_started, &c->flags)) { /* * If we're not in online fsck, don't delete unlinked @@ -1147,7 +1386,11 @@ static int check_inode(struct btree_trans *trans, if (ret) goto err; } else { - if (fsck_err_on(!bch2_inode_is_open(c, k.k->p), + ret = bch2_inode_or_descendents_is_open(trans, k.k->p); + if (ret < 0) + goto err; + + if (fsck_err_on(!ret, trans, inode_unlinked_and_not_open, "inode %llu%u unlinked and not open", u.bi_inum, u.bi_snapshot)) { @@ -1155,69 +1398,10 @@ static int check_inode(struct btree_trans *trans, bch_err_msg(c, ret, "in fsck deleting inode"); goto err_noprint; } + ret = 0; } } - /* i_size_dirty is vestigal, since we now have logged ops for truncate * */ - if (u.bi_flags & BCH_INODE_i_size_dirty && - (!test_bit(BCH_FS_clean_recovery, &c->flags) || - fsck_err(trans, inode_i_size_dirty_but_clean, - "filesystem marked clean, but inode %llu has i_size dirty", - u.bi_inum))) { - bch_verbose(c, "truncating inode %llu", u.bi_inum); - - /* - * XXX: need to truncate partial blocks too here - or ideally - * just switch units to bytes and that issue goes away - */ - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, - iter->pos.snapshot), - POS(u.bi_inum, U64_MAX), - 0, NULL); - bch_err_msg(c, ret, "in fsck truncating inode"); - if (ret) - return ret; - - /* - * We truncated without our normal sector accounting hook, just - * make sure we recalculate it: - */ - 
u.bi_flags |= BCH_INODE_i_sectors_dirty; - - u.bi_flags &= ~BCH_INODE_i_size_dirty; - do_update = true; - } - - /* i_sectors_dirty is vestigal, i_sectors is always updated transactionally */ - if (u.bi_flags & BCH_INODE_i_sectors_dirty && - (!test_bit(BCH_FS_clean_recovery, &c->flags) || - fsck_err(trans, inode_i_sectors_dirty_but_clean, - "filesystem marked clean, but inode %llu has i_sectors dirty", - u.bi_inum))) { - s64 sectors; - - bch_verbose(c, "recounting sectors for inode %llu", - u.bi_inum); - - sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); - if (sectors < 0) { - bch_err_msg(c, sectors, "in fsck recounting inode sectors"); - return sectors; - } - - u.bi_sectors = sectors; - u.bi_flags &= ~BCH_INODE_i_sectors_dirty; - do_update = true; - } - - if (u.bi_flags & BCH_INODE_backptr_untrusted) { - u.bi_dir = 0; - u.bi_dir_offset = 0; - u.bi_flags &= ~BCH_INODE_backptr_untrusted; - do_update = true; - } - if (fsck_err_on(u.bi_parent_subvol && (u.bi_subvol == 0 || u.bi_subvol == BCACHEFS_ROOT_SUBVOL), @@ -1274,8 +1458,7 @@ err_noprint: int bch2_check_inodes(struct bch_fs *c) { - bool full = c->opts.fsck; - struct bch_inode_unpacked prev = { 0 }; + struct bch_inode_unpacked snapshot_root = {}; struct snapshots_seen s; snapshots_seen_init(&s); @@ -1285,13 +1468,104 @@ int bch2_check_inodes(struct bch_fs *c) POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_inode(trans, &iter, k, &prev, &s, full))); + check_inode(trans, &iter, k, &snapshot_root, &s))); snapshots_seen_exit(&s); bch_err_fn(c, ret); return ret; } +static int find_oldest_inode_needs_reattach(struct btree_trans *trans, + struct bch_inode_unpacked *inode) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + /* + * We look for inodes to reattach in natural key order, leaves first, + * but we should do the reattach at the oldest version that needs to be + * reattached: + */ + 
for_each_btree_key_norestart(trans, iter, + BTREE_ID_inodes, + SPOS(0, inode->bi_inum, inode->bi_snapshot + 1), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode->bi_inum) + break; + + if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot)) + continue; + + if (!bkey_is_inode(k.k)) + break; + + struct bch_inode_unpacked parent_inode; + bch2_inode_unpack(k, &parent_inode); + + if (!inode_should_reattach(&parent_inode)) + break; + + *inode = parent_inode; + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int check_unreachable_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (!bkey_is_inode(k.k)) + return 0; + + struct bch_inode_unpacked inode; + BUG_ON(bch2_inode_unpack(k, &inode)); + + if (!inode_should_reattach(&inode)) + return 0; + + ret = find_oldest_inode_needs_reattach(trans, &inode); + if (ret) + return ret; + + if (fsck_err(trans, inode_unreachable, + "unreachable inode:\n%s", + (bch2_inode_unpacked_to_text(&buf, &inode), + buf.buf))) + ret = reattach_inode(trans, &inode); +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * Reattach unreachable (but not unlinked) inodes + * + * Run after check_inodes() and check_dirents(), so we know that inode + * backpointer fields point to valid dirents, and every inode that has a dirent + * that points to it has its backpointer field set - so we're just looking for + * non-unlinked inodes without backpointers: + * + * XXX: this is racy w.r.t. 
hardlink removal in online fsck + */ +int bch2_check_unreachable_inodes(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_unreachable_inode(trans, &iter, k))); + bch_err_fn(c, ret); + return ret; +} + static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) { switch (btree) { @@ -1694,8 +1968,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) continue; - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && - k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && + if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), trans, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n %s", @@ -2207,7 +2480,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; - ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); + ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) goto err; if (ret) { @@ -2321,7 +2594,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); inode->first_this_inode = false; - ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); + ret = hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); bch_err_fn(c, ret); return ret; } @@ -2409,7 +2682,7 @@ fsck_err: /* Get root directory, create if it doesn't exist: */ int bch2_check_root(struct bch_fs *c) { - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 
check_root_trans(trans)); bch_err_fn(c, ret); return ret; @@ -2450,22 +2723,6 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, if (ret) break; - /* - * We've checked that inode backpointers point to valid dirents; - * here, it's sufficient to check that the subvolume root has a - * dirent: - */ - if (fsck_err_on(!subvol_root.bi_dir, - trans, subvol_unreachable, - "unreachable subvolume %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), - prt_newline(&buf), - bch2_inode_unpacked_to_text(&buf, &subvol_root), - buf.buf))) { - ret = reattach_subvol(trans, s); - break; - } - u32 parent = le32_to_cpu(s.v->fs_path_parent); if (darray_u32_has(&subvol_path, parent)) { @@ -2526,12 +2783,6 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) return false; } -/* - * Check that a given inode is reachable from its subvolume root - we already - * verified subvolume connectivity: - * - * XXX: we should also be verifying that inodes are in the right subvolumes - */ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; @@ -2545,6 +2796,9 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino BUG_ON(bch2_inode_unpack(inode_k, &inode)); + if (!S_ISDIR(inode.bi_mode)) + return 0; + while (!inode.bi_subvol) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; @@ -2559,21 +2813,15 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino bch2_trans_iter_exit(trans, &dirent_iter); if (bch2_err_matches(ret, ENOENT)) { - ret = 0; - if (fsck_err(trans, inode_unreachable, - "unreachable inode\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, inode_k), - buf.buf))) - ret = reattach_inode(trans, &inode); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, inode_k); + bch_err(c, "unreachable inode in check_directory_structure: %s\n%s", + bch2_err_str(ret), buf.buf); goto out; } bch2_trans_iter_exit(trans, 
&dirent_iter); - if (!S_ISDIR(inode.bi_mode)) - break; - ret = darray_push(p, ((struct pathbuf_entry) { .inum = inode.bi_inum, .snapshot = snapshot, @@ -2626,9 +2874,8 @@ fsck_err: } /* - * Check for unreachable inodes, as well as loops in the directory structure: - * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's - * unreachable: + * Check for loops in the directory structure: all other connectivity issues + * have been fixed by prior passes */ int bch2_check_directory_structure(struct bch_fs *c) { @@ -2756,6 +3003,10 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, if (S_ISDIR(u.bi_mode)) continue; + /* + * Previous passes ensured that bi_nlink is nonzero if + * it had multiple hardlinks: + */ if (!u.bi_nlink) continue; diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index a4ef94271784..1cca31011530 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -9,6 +9,7 @@ int bch2_check_dirents(struct bch_fs *); int bch2_check_xattrs(struct bch_fs *); int bch2_check_root(struct bch_fs *); int bch2_check_subvolume_structure(struct bch_fs *); +int bch2_check_unreachable_inodes(struct bch_fs *); int bch2_check_directory_structure(struct bch_fs *); int bch2_check_nlinks(struct bch_fs *); int bch2_fix_reflink_p(struct bch_fs *); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 3e5bc01961b8..039cb7a22244 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -12,6 +12,7 @@ #include "error.h" #include "extents.h" #include "extent_update.h" +#include "fs.h" #include "inode.h" #include "str_hash.h" #include "snapshot.h" @@ -34,6 +35,8 @@ static const char * const bch2_inode_flag_strs[] = { }; #undef x +static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); + static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; static int inode_decode_field(const u8 *in, const u8 *end, @@ -160,8 +163,8 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, unsigned 
fieldnr = 0, field_bits; int ret; -#define x(_name, _bits) \ - if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ +#define x(_name, _bits) \ + if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \ unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ memset((void *) unpacked + offset, 0, \ sizeof(*unpacked) - offset); \ @@ -280,6 +283,8 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, { memset(unpacked, 0, sizeof(*unpacked)); + unpacked->bi_snapshot = k.k->p.snapshot; + switch (k.k->type) { case KEY_TYPE_inode: { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); @@ -290,10 +295,10 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); - if (INODE_NEW_VARINT(inode.v)) { + if (INODEv1_NEW_VARINT(inode.v)) { return bch2_inode_unpack_v2(unpacked, inode.v->fields, bkey_val_end(inode), - INODE_NR_FIELDS(inode.v)); + INODEv1_NR_FIELDS(inode.v)); } else { return bch2_inode_unpack_v1(inode, unpacked); } @@ -468,10 +473,10 @@ int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); int ret = 0; - bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", - INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); + INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR); ret = __bch2_inode_validate(c, k, flags); fsck_err: @@ -530,6 +535,10 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, prt_printf(out, "(%x)\n", inode->bi_flags); prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq); + prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed); + prt_printf(out, "hash_type="); + bch2_prt_str_hash_type(out, INODE_STR_HASH(inode)); + prt_newline(out); prt_printf(out, "bi_size=%llu\n", inode->bi_size); prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors); 
prt_printf(out, "bi_version=%llu\n", inode->bi_version); @@ -575,9 +584,137 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k) } } -static inline bool bkey_is_deleted_inode(struct bkey_s_c k) +static inline void bkey_inode_flags_set(struct bkey_s k, u64 f) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f); + return; + case KEY_TYPE_inode_v2: + bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f); + return; + case KEY_TYPE_inode_v3: + bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f); + return; + default: + BUG(); + } +} + +static inline bool bkey_is_unlinked_inode(struct bkey_s_c k) { - return bkey_inode_flags(k) & BCH_INODE_unlinked; + unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked; + + return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot); +} + +static struct bkey_s_c +bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, + enum btree_id btree, struct bpos pos, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_upto_norestart(trans, *iter, btree, + bpos_successor(pos), + SPOS(pos.inode, pos.offset, U32_MAX), + flags|BTREE_ITER_all_snapshots, k, ret) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot)) + return k; + + bch2_trans_iter_exit(trans, iter); + return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; +} + +static struct bkey_s_c +bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos, unsigned flags) +{ + struct bkey_s_c k; +again: + k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags); + if (!k.k || + bkey_err(k) || + bkey_is_inode(k.k)) + return k; + + bch2_trans_iter_exit(trans, iter); + pos = k.k->p; + goto again; +} + +int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_upto_norestart(trans, iter, + BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), + BTREE_ITER_all_snapshots| + BTREE_ITER_with_updates, k, ret) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) && + bkey_is_inode(k.k)) { + ret = 1; + break; + } + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int update_inode_has_children(struct btree_trans *trans, + struct bkey_s k, + bool have_child) +{ + if (!have_child) { + int ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret) + return ret < 0 ? ret : 0; + } + + u64 f = bkey_inode_flags(k.s_c); + if (have_child != !!(f & BCH_INODE_has_child_snapshot)) + bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot); + + return 0; +} + +static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos, + bool have_child) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans, + &iter, pos, BTREE_ITER_with_updates); + int ret = bkey_err(k); + if (ret) + return ret; + if (!k.k) + return 0; + + if (!have_child) { + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret) { + ret = ret < 0 ? 
ret : 0; + goto err; + } + } + + u64 f = bkey_inode_flags(k); + if (have_child != !!(f & BCH_INODE_has_child_snapshot)) { + struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k, + BTREE_UPDATE_internal_snapshot_node); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; } int bch2_trigger_inode(struct btree_trans *trans, @@ -586,6 +723,8 @@ int bch2_trigger_inode(struct btree_trans *trans, struct bkey_s new, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; + if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { BUG_ON(!trans->journal_res.seq); bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); @@ -599,13 +738,41 @@ int bch2_trigger_inode(struct btree_trans *trans, return ret; } - int deleted_delta = (int) bkey_is_deleted_inode(new.s_c) - - (int) bkey_is_deleted_inode(old); - if ((flags & BTREE_TRIGGER_transactional) && deleted_delta) { - int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, - new.k->p, deleted_delta > 0); - if (ret) - return ret; + if (flags & BTREE_TRIGGER_transactional) { + int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) - + (int) bkey_is_unlinked_inode(old); + if (unlinked_delta) { + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, + new.k->p, unlinked_delta > 0); + if (ret) + return ret; + } + + /* + * If we're creating or deleting an inode at this snapshot ID, + * and there might be an inode in a parent snapshot ID, we might + * need to set or clear the has_child_snapshot flag on the + * parent. 
+ */ + int deleted_delta = (int) bkey_is_inode(new.k) - + (int) bkey_is_inode(old.k); + if (deleted_delta && + bch2_snapshot_parent(c, new.k->p.snapshot)) { + int ret = update_parent_inode_has_children(trans, new.k->p, + deleted_delta > 0); + if (ret) + return ret; + } + + /* + * When an inode is first updated in a new snapshot, we may need + * to clear has_child_snapshot + */ + if (deleted_delta > 0) { + int ret = update_inode_has_children(trans, new, false); + if (ret) + return ret; + } } return 0; @@ -639,10 +806,8 @@ void bch2_inode_init_early(struct bch_fs *c, memset(inode_u, 0, sizeof(*inode_u)); - /* ick */ - inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; - get_random_bytes(&inode_u->bi_hash_seed, - sizeof(inode_u->bi_hash_seed)); + SET_INODE_STR_HASH(inode_u, str_hash); + get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); } void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, @@ -888,6 +1053,11 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + if (ret) + goto err2; + + ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot)); +err2: bch2_trans_put(trans); return ret; } @@ -921,8 +1091,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, struct bch_inode_unpacked *inode) { - return bch2_trans_do(c, NULL, NULL, 0, - bch2_inode_find_by_inum_trans(trans, inum, inode)); + return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); } int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) @@ -992,7 +1161,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i return 0; } -int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) +static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) { struct bch_fs *c = trans->c; struct btree_iter iter = { NULL }; @@ -1055,6 +1224,45 @@ err: return ret ?: 
-BCH_ERR_transaction_restart_nested; } +/* + * After deleting an inode, there may be versions in older snapshots that should + * also be deleted - if they're not referenced by sibling snapshots and not open + * in other subvolumes: + */ +static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; +next_parent: + ret = lockrestart_do(trans, + bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0))); + if (ret || !k.k) + return ret; + + bool unlinked = bkey_is_unlinked_inode(k); + pos = k.k->p; + bch2_trans_iter_exit(trans, &iter); + + if (!unlinked) + return 0; + + ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos)); + if (ret) + return ret < 0 ? ret : 0; + + ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot); + if (ret) + return ret; + goto next_parent; +} + +int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) +{ + return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?: + delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); +} + static int may_delete_deleted_inode(struct btree_trans *trans, struct btree_iter *iter, struct bpos pos, @@ -1064,6 +1272,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct btree_iter inode_iter; struct bkey_s_c k; struct bch_inode_unpacked inode; + struct printbuf buf = PRINTBUF; int ret; k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached); @@ -1099,6 +1308,31 @@ static int may_delete_deleted_inode(struct btree_trans *trans, pos.offset, pos.snapshot)) goto delete; + if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot, + trans, deleted_inode_has_child_snapshots, + "inode with child snapshots %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret < 0) + goto out; + + if (ret) { + if (fsck_err(trans, 
inode_has_child_snapshots_wrong, + "inode has_child_snapshots flag wrong (should be set)\n%s", + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &inode), + buf.buf))) { + inode.bi_flags |= BCH_INODE_has_child_snapshot; + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) + goto out; + } + goto delete; + + } + if (test_bit(BCH_FS_clean_recovery, &c->flags) && !fsck_err(trans, deleted_inode_but_clean, "filesystem marked as clean but have deleted inode %llu:%u", @@ -1107,33 +1341,11 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; } - if (bch2_snapshot_is_internal_node(c, pos.snapshot)) { - struct bpos new_min_pos; - - ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos); - if (ret) - goto out; - - inode.bi_flags &= ~BCH_INODE_unlinked; - - ret = bch2_inode_write_flags(trans, &inode_iter, &inode, - BTREE_UPDATE_internal_snapshot_node); - bch_err_msg(c, ret, "clearing inode unlinked flag"); - if (ret) - goto out; - - /* - * We'll need another write buffer flush to pick up the new - * unlinked inodes in the snapshot leaves: - */ - *need_another_pass = true; - goto out; - } - ret = 1; out: fsck_err: bch2_trans_iter_exit(trans, &inode_iter); + printbuf_exit(&buf); return ret; delete: ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 9c1f67705684..eab82b5eb897 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -5,6 +5,7 @@ #include "bkey.h" #include "bkey_methods.h" #include "opts.h" +#include "snapshot.h" enum bch_validate_flags; extern const char * const bch2_inode_opts[]; @@ -17,6 +18,15 @@ int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos); + +static inline int bch2_inode_has_child_snapshots(struct 
btree_trans *trans, struct bpos pos) +{ + return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0 + ? __bch2_inode_has_child_snapshots(trans, pos) + : 0; +} + int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); @@ -82,6 +92,7 @@ struct bch_inode_unpacked { BCH_INODE_FIELDS_v3() #undef x }; +BITMASK(INODE_STR_HASH, struct bch_inode_unpacked, bi_flags, 20, 24); struct bkey_inode_buf { struct bkey_i_inode_v3 inode; diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index 83d107331edf..7928d0c6954f 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -133,7 +133,8 @@ enum inode_opt_id { x(i_size_dirty, 5) \ x(i_sectors_dirty, 6) \ x(unlinked, 7) \ - x(backptr_untrusted, 8) + x(backptr_untrusted, 8) \ + x(has_child_snapshot, 9) /* bits 20+ reserved for packed fields below: */ @@ -149,9 +150,9 @@ enum __bch_inode_flags { #undef x }; -LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); -LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); +LE32_BITMASK(INODEv1_STR_HASH, struct bch_inode, bi_flags, 20, 24); +LE32_BITMASK(INODEv1_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32); LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 307ed0a45184..f283051758d6 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -377,7 +377,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, * check for missing subvolume before fpunch, as in resume we don't want * it to be a fatal error */ - ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors); + ret = lockrestart_do(trans, 
__bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors)); if (ret) return ret; diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index e4fc17c548fd..b3b934a87c6d 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -262,7 +262,8 @@ err: bio_free_pages(&(*rbio)->bio); kfree(*rbio); *rbio = NULL; - kfree(op); + /* We may have added to the rhashtable and thus need rcu freeing: */ + kfree_rcu(op, rcu); bch2_write_ref_put(c, BCH_WRITE_REF_promote); return ERR_PTR(ret); } @@ -409,8 +410,8 @@ retry: bch2_trans_begin(trans); rbio->bio.bi_status = 0; - k = bch2_btree_iter_peek_slot(&iter); - if (bkey_err(k)) + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (ret) goto err; bch2_bkey_buf_reassemble(&sk, c, k); @@ -557,8 +558,8 @@ out: static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { - bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_rbio_narrow_crcs(trans, rbio)); + bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_rbio_narrow_crcs(trans, rbio)); } /* Inner part that may run in process context */ @@ -802,16 +803,15 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, PTR_BUCKET_POS(ca, &ptr), BTREE_ITER_cached); - u8 *gen = bucket_gen(ca, iter.pos.offset); - if (gen) { - + int gen = bucket_gen_get(ca, iter.pos.offset); + if (gen >= 0) { prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); printbuf_indent_add(&buf, 2); bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); - prt_printf(&buf, "memory gen: %u", *gen); + prt_printf(&buf, "memory gen: %u", gen); ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); if (!ret) { diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index b5fe9e0dc155..96720adcfee0 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1300,11 +1300,8 @@ retry: bucket_to_u64(i->b), BUCKET_NOCOW_LOCK_UPDATE); - 
rcu_read_lock(); - u8 *gen = bucket_gen(ca, i->b.offset); - stale = !gen ? -1 : gen_after(*gen, i->gen); - rcu_read_unlock(); - + int gen = bucket_gen_get(ca, i->b.offset); + stale = gen < 0 ? gen : gen_after(gen, i->gen); if (unlikely(stale)) { stale_at = i; goto err_bucket_stale; @@ -1437,7 +1434,7 @@ again: * freeing up space on specific disks, which means that * allocations for specific disks may hang arbitrarily long: */ - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_run(c, lockrestart_do(trans, bch2_alloc_sectors_start_trans(trans, op->target, op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), @@ -1447,7 +1444,7 @@ again: op->nr_replicas_required, op->watermark, op->flags, - &op->cl, &wp)); + &op->cl, &wp))); if (unlikely(ret)) { if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) break; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f5f7db50ca31..2dc0d60c1745 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -603,6 +603,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, { int ret; + if (closure_wait_event_timeout(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK), + HZ * 10)) + return ret; + + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct printbuf buf = PRINTBUF; + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "Journal stuck? 
Waited for 10 seconds...\n%s", + buf.buf); + printbuf_exit(&buf); + closure_wait_event(&j->async_wait, (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || (flags & JOURNAL_RES_GET_NONBLOCK)); @@ -745,7 +758,7 @@ out: return ret; } -int bch2_journal_flush_seq(struct journal *j, u64 seq) +int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state) { u64 start_time = local_clock(); int ret, ret2; @@ -756,7 +769,9 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) if (seq <= j->flushed_seq_ondisk) return 0; - ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); + ret = wait_event_state(j->wait, + (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)), + task_state); if (!ret) bch2_time_stats_update(j->flush_seq_time, start_time); @@ -775,7 +790,7 @@ void bch2_journal_flush_async(struct journal *j, struct closure *parent) int bch2_journal_flush(struct journal *j) { - return bch2_journal_flush_seq(j, atomic64_read(&j->seq)); + return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE); } /* @@ -838,7 +853,7 @@ int bch2_journal_meta(struct journal *j) bch2_journal_res_put(j, &res); - return bch2_journal_flush_seq(j, res.seq); + return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE); } /* block/unlock the journal: */ diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 377a3750406e..2762be6f9814 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -401,7 +401,7 @@ void bch2_journal_entry_res_resize(struct journal *, int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); void bch2_journal_flush_async(struct journal *, struct closure *); -int bch2_journal_flush_seq(struct journal *, u64); +int bch2_journal_flush_seq(struct journal *, u64, unsigned); int bch2_journal_flush(struct journal *); bool bch2_journal_noflush_seq(struct journal *, u64); int bch2_journal_meta(struct journal *); diff --git 
a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 954f6a96e0f4..fb35dd336331 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -708,6 +708,9 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs container_of(entry, struct jset_entry_dev_usage, entry); unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); + if (vstruct_bytes(entry) < sizeof(*u)) + return; + prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); printbuf_indent_add(out, 2); @@ -1012,6 +1015,8 @@ reread: nr_bvecs = buf_pages(buf->data, sectors_read << 9); bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!bio) + return -BCH_ERR_ENOMEM_journal_read_bucket; bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); bio->bi_iter.bi_sector = offset; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8c456d8b8b99..0ef4a86850bb 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -266,7 +266,7 @@ int bch2_move_extent(struct moving_context *ctxt, if (!data_opts.rewrite_ptrs && !data_opts.extra_replicas) { if (data_opts.kill_ptrs) - return bch2_extent_drop_ptrs(trans, iter, k, data_opts); + return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); return 0; } diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 232be8a44051..0e2ee262fbd4 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -63,7 +63,7 @@ const char * const bch2_compression_opts[] = { NULL }; -const char * const bch2_str_hash_types[] = { +const char * const __bch2_str_hash_types[] = { BCH_STR_HASH_TYPES() NULL }; @@ -115,6 +115,7 @@ PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); +PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, struct 
printbuf *err) @@ -225,7 +226,7 @@ const struct bch_option bch2_opt_table[] = { #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ .min = _min, .max = _max #define OPT_STR(_choices) .type = BCH_OPT_STR, \ - .min = 0, .max = ARRAY_SIZE(_choices), \ + .min = 0, .max = ARRAY_SIZE(_choices) - 1, \ .choices = _choices #define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \ .min = 0, .max = U64_MAX, \ @@ -427,7 +428,9 @@ void bch2_opt_to_text(struct printbuf *out, prt_printf(out, "%lli", v); break; case BCH_OPT_STR: - if (flags & OPT_SHOW_FULL_LIST) + if (v < opt->min || v >= opt->max) + prt_printf(out, "(invalid option %lli)", v); + else if (flags & OPT_SHOW_FULL_LIST) prt_string_option(out, opt->choices, v); else prt_str(out, opt->choices[v]); @@ -594,6 +597,9 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, copied_opts_start = copied_opts; while ((opt = strsep(&copied_opts, ",")) != NULL) { + if (!*opt) + continue; + name = strsep(&opt, "="); val = opt; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index cb2e244a2429..23dda014e331 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -18,7 +18,7 @@ extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; extern const char * const bch2_csum_opts[]; extern const char * const bch2_compression_opts[]; -extern const char * const bch2_str_hash_types[]; +extern const char * const __bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; extern const char * const __bch2_data_types[]; extern const char * const bch2_member_states[]; @@ -29,6 +29,7 @@ void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); void bch2_prt_data_type(struct printbuf *, enum bch_data_type); void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); +void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); static inline const char *bch2_d_type_str(unsigned 
d_type) { diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index c32a05e252e2..74f45a8162ad 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -869,7 +869,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, bkey_quota_init(&new_quota.k_i); new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_commit_do(c, NULL, NULL, 0, bch2_set_quota_trans(trans, &new_quota, qdq)) ?: __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 2d299a37cf07..cd6647374353 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -70,7 +70,9 @@ err: int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) { - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, + int ret = bch2_trans_commit_do(c, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_lazy_rw, __bch2_set_rebalance_needs_scan(trans, inum)); rebalance_wakeup(c); return ret; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 6db72d3bad7d..3c7f941dde39 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -94,11 +94,10 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); @@ -287,7 +286,8 @@ int bch2_journal_replay(struct bch_fs *c) BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_skip_accounting_apply| - BCH_TRANS_COMMIT_no_journal_res, + BCH_TRANS_COMMIT_no_journal_res| + BCH_WATERMARK_reclaim, 
bch2_journal_replay_accounting_key(trans, k)); if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) goto err; @@ -862,6 +862,13 @@ use_clean: if (ret) goto err; + /* + * Normally set by the appropriate recovery pass: when cleared, this + * indicates we're in early recovery and btree updates should be done by + * being applied to the journal replay keys. _Must_ be cleared before + * multithreaded use: + */ + set_bit(BCH_FS_may_go_rw, &c->flags); clear_bit(BCH_FS_fsck_running, &c->flags); /* in case we don't run journal replay, i.e. norecovery mode */ @@ -1001,6 +1008,7 @@ int bch2_fs_initialize(struct bch_fs *c) struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; struct qstr lostfound = QSTR("lost+found"); + struct bch_member *m; int ret; bch_notice(c, "initializing new filesystem"); @@ -1017,6 +1025,14 @@ int bch2_fs_initialize(struct bch_fs *c) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); bch2_write_super(c); } + + for_each_member_device(c, ca) { + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false); + ca->mi = bch2_mi_to_cpu(m); + } + + bch2_write_super(c); mutex_unlock(&c->sb_lock); c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; @@ -1090,7 +1106,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_init_early(c, &lostfound_inode); - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_commit_do(c, NULL, NULL, 0, bch2_create_trans(trans, BCACHEFS_ROOT_SUBVOL_INUM, &root_inode, &lostfound_inode, diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 735b8adc8f9d..dff589ddc984 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -27,6 +27,12 @@ const char * const bch2_recovery_passes[] = { NULL }; +/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */ +static int bch2_recovery_pass_empty(struct bch_fs *c) +{ + return 0; +} + 
static int bch2_set_may_go_rw(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; @@ -221,6 +227,12 @@ int bch2_run_recovery_passes(struct bch_fs *c) { int ret = 0; + /* + * We can't allow set_may_go_rw to be excluded; that would cause us to + * use the journal replay keys for updates where it's not expected. + */ + c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { if (c->opts.recovery_pass_last && c->curr_recovery_pass > c->opts.recovery_pass_last) diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 50406ce0e4ef..94dc20ca2065 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -13,6 +13,7 @@ * must never change: */ #define BCH_RECOVERY_PASSES() \ + x(recovery_pass_empty, 41, PASS_SILENT) \ x(scan_for_btree_nodes, 37, 0) \ x(check_topology, 4, 0) \ x(accounting_read, 39, PASS_ALWAYS) \ @@ -46,6 +47,7 @@ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index bcb3276747e0..477ef0997949 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out, prt_printf(out, "]"); } -static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r, - struct bch_sb *sb, - struct printbuf *err) +static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r, + struct bch_sb *sb, + struct printbuf *err) { if (!r->nr_devs) { prt_printf(err, "no devices in entry "); @@ -98,10 +98,28 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, struct bch_fs *c, struct 
printbuf *err) { - mutex_lock(&c->sb_lock); - int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err); - mutex_unlock(&c->sb_lock); - return ret; + if (!r->nr_devs) { + prt_printf(err, "no devices in entry "); + goto bad; + } + + if (r->nr_required > 1 && + r->nr_required >= r->nr_devs) { + prt_printf(err, "bad nr_required in entry "); + goto bad; + } + + for (unsigned i = 0; i < r->nr_devs; i++) + if (r->devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_dev_exists(c, r->devs[i])) { + prt_printf(err, "invalid device %u in entry ", r->devs[i]); + goto bad; + } + + return 0; +bad: + bch2_replicas_entry_to_text(err, r); + return -BCH_ERR_invalid_replicas_entry; } void bch2_cpu_replicas_to_text(struct printbuf *out, @@ -686,7 +704,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, struct bch_replicas_entry_v1 *e = cpu_replicas_entry(cpu_r, i); - int ret = bch2_replicas_entry_validate_locked(e, sb, err); + int ret = bch2_replicas_entry_sb_validate(e, sb, err); if (ret) return ret; @@ -803,6 +821,11 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, rcu_read_lock(); for (unsigned i = 0; i < e->nr_devs; i++) { + if (e->devs[i] == BCH_SB_MEMBER_INVALID) { + nr_failed++; + continue; + } + nr_online += test_bit(e->devs[i], devs.d); struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 5102059a0f1d..8767c33c2b51 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -78,7 +78,10 @@ BCH_FSCK_ERR_accounting_mismatch) \ x(rebalance_work_acct_fix, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) + BCH_FSCK_ERR_accounting_mismatch) \ + x(inode_has_child_snapshots, \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BCH_FSCK_ERR_inode_has_child_snapshots_wrong) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ @@ -140,6 +143,9 @@ UPGRADE_TABLE() static int have_stripes(struct bch_fs *c) { + 
if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b)) + return 0; + return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b); } diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 4135b1ea2fec..9feb6739f77a 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -136,7 +136,9 @@ enum bch_fsck_flags { x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \ x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ x(need_discard_freespace_key_bad, 124, 0) \ + x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ x(backpointer_bucket_offset_wrong, 125, 0) \ + x(backpointer_level_bad, 294, 0) \ x(backpointer_to_missing_device, 126, 0) \ x(backpointer_to_missing_alloc, 127, 0) \ x(backpointer_to_missing_ptr, 128, 0) \ @@ -177,9 +179,12 @@ enum bch_fsck_flags { x(ptr_stripe_redundant, 163, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ x(reflink_v_refcount_wrong, 165, 0) \ + x(reflink_v_pos_bad, 292, 0) \ x(reflink_p_to_missing_reflink_v, 166, 0) \ + x(reflink_refcount_underflow, 293, 0) \ x(stripe_pos_bad, 167, 0) \ x(stripe_val_size_bad, 168, 0) \ + x(stripe_csum_granularity_bad, 290, 0) \ x(stripe_sector_count_wrong, 169, 0) \ x(snapshot_tree_pos_bad, 170, 0) \ x(snapshot_tree_to_missing_snapshot, 171, 0) \ @@ -225,11 +230,13 @@ enum bch_fsck_flags { x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ + x(inode_has_child_snapshots_wrong, 287, 0) \ x(inode_unreachable, 210, FSCK_AUTOFIX) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ + x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \ x(extent_overlapping, 215, 0) \ x(key_in_missing_inode, 216, 0) \ x(key_in_wrong_inode_type, 217, 0) \ @@ -264,8 +271,8 @@ enum bch_fsck_flags { 
x(journal_entry_dup_same_device, 246, 0) \ x(inode_bi_subvol_missing, 247, 0) \ x(inode_bi_subvol_wrong, 248, 0) \ - x(inode_points_to_missing_dirent, 249, 0) \ - x(inode_points_to_wrong_dirent, 250, 0) \ + x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \ + x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \ x(inode_bi_parent_nonzero, 251, 0) \ x(dirent_to_missing_parent_subvol, 252, 0) \ x(dirent_not_visible_in_parent_subvol, 253, 0) \ @@ -289,6 +296,7 @@ enum bch_fsck_flags { x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \ x(accounting_mismatch, 272, FSCK_AUTOFIX) \ x(accounting_replicas_not_marked, 273, 0) \ + x(accounting_to_invalid_device, 289, 0) \ x(invalid_btree_id, 274, 0) \ x(alloc_key_io_time_bad, 275, 0) \ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ @@ -298,7 +306,7 @@ enum bch_fsck_flags { x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ - x(MAX, 287, 0) + x(MAX, 295, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 02bcde3c1b02..116131f95815 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -163,6 +163,11 @@ static int validate_member(struct printbuf *err, return -BCH_ERR_invalid_sb_members; } + if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) { + prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift); + return -BCH_ERR_invalid_sb_members; + } + return 0; } @@ -247,7 +252,10 @@ static void member_to_text(struct printbuf *out, prt_newline(out); prt_printf(out, "Btree allocated bitmap blocksize:\t"); - prt_units_u64(out, 1ULL << m.btree_bitmap_shift); + if (m.btree_bitmap_shift < 64) + prt_units_u64(out, 1ULL << m.btree_bitmap_shift); + else + prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift); prt_newline(out); prt_printf(out, "Btree allocated bitmap:\t"); @@ -442,7 +450,7 @@ static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, uns m->btree_bitmap_shift += resize; } - BUG_ON(m->btree_bitmap_shift > 57); + BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX); BUG_ON(end > 64ULL << m->btree_bitmap_shift); for (unsigned bit = start >> m->btree_bitmap_shift; diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index d727d2dfda08..2adf1221a440 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -66,6 +66,12 @@ struct bch_member { }; /* + * btree_allocated_bitmap can represent sector addresses of a u64: it itself has + * 64 elements, so 64 - ilog2(64) + */ +#define BCH_MI_BTREE_BITMAP_SHIFT_MAX 58 + +/* * This limit comes from the bucket_gens array - it's a single allocation, and * kernel allocation are limited to INT_MAX */ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 1809442b00ee..ae57638506c3 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -905,12 +905,30 @@ static int 
check_snapshot_exists(struct btree_trans *trans, u32 id) if (bch2_snapshot_equiv(c, id)) return 0; - /* 0 is an invalid tree ID */ + /* Do we need to reconstruct the snapshot_tree entry as well? */ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; u32 tree_id = 0; - int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); + + for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, + 0, k, ret) { + if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { + tree_id = k.k->p.offset; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + if (ret) return ret; + if (!tree_id) { + ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); + if (ret) + return ret; + } + struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot)); ret = PTR_ERR_OR_ZERO(snapshot); if (ret) @@ -921,6 +939,16 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) snapshot->v.tree = cpu_to_le32(tree_id); snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { + if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { + snapshot->v.subvol = cpu_to_le32(k.k->p.offset); + SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); + break; + } + } + bch2_trans_iter_exit(trans, &iter); + return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: @@ -1732,103 +1760,6 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return ret; } -static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - - return s->children[1] ?: s->children[0]; -} - -static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id) -{ - u32 child; - - while ((child = bch2_snapshot_smallest_child(c, id))) - id = child; - return id; -} - -static int 
bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, - enum btree_id btree, - struct bkey_s_c interior_k, - u32 leaf_id, struct bpos *new_min_pos) -{ - struct btree_iter iter; - struct bpos pos = interior_k.k->p; - struct bkey_s_c k; - struct bkey_i *new; - int ret; - - pos.snapshot = leaf_id; - - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto out; - - /* key already overwritten in this snapshot? */ - if (k.k->p.snapshot != interior_k.k->p.snapshot) - goto out; - - if (bpos_eq(*new_min_pos, POS_MIN)) { - *new_min_pos = k.k->p; - new_min_pos->snapshot = leaf_id; - } - - new = bch2_bkey_make_mut_noupdate(trans, interior_k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - new->k.p.snapshot = leaf_id; - ret = bch2_trans_update(trans, &iter, new, 0); -out: - bch2_set_btree_iter_dontneed(&iter); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, - enum btree_id btree, - struct bkey_s_c k, - struct bpos *new_min_pos) -{ - struct bch_fs *c = trans->c; - struct bkey_buf sk; - u32 restart_count = trans->restart_count; - int ret = 0; - - bch2_bkey_buf_init(&sk); - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - *new_min_pos = POS_MIN; - - for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot); - id < k.k->p.snapshot; - id++) { - if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) || - !bch2_snapshot_is_leaf(c, id)) - continue; -again: - ret = btree_trans_too_many_iters(trans) ?: - bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?: - bch2_trans_commit(trans, NULL, NULL, 0); - if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - bch2_trans_begin(trans); - goto again; - } - - if (ret) - break; - } - - bch2_bkey_buf_exit(&sk, c); - - return ret ?: trans_was_restarted(trans, restart_count); -} - static int 
bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index eb5ef64221d6..29c94716293e 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -259,9 +259,6 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return __bch2_key_has_snapshot_overwrites(trans, id, pos); } -int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id, - struct bkey_s_c, struct bpos *); - int bch2_snapshots_read(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 215eed4cce6d..ec2b1feea520 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -46,8 +46,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) { /* XXX ick */ struct bch_hash_info info = { - .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & - ~(~0U << INODE_STR_HASH_BITS), + .type = INODE_STR_HASH(bi), .siphash_key = { .k0 = bi->bi_hash_seed } }; @@ -253,19 +252,20 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, } static __always_inline -int bch2_hash_set_in_snapshot(struct btree_trans *trans, +struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, + struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, struct bkey_i *insert, enum btree_iter_update_trigger_flags flags) { - struct btree_iter iter, slot = { NULL }; + struct btree_iter slot = {}; struct bkey_s_c k; bool found = false; int ret; - for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, + for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(insert->k.p.inode, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), @@ -280,7 +280,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, } if (!slot.path && !(flags & STR_HASH_must_replace)) - 
bch2_trans_copy_iter(&slot, &iter); + bch2_trans_copy_iter(&slot, iter); if (k.k->type != KEY_TYPE_hash_whiteout) goto not_found; @@ -290,29 +290,50 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, ret = -BCH_ERR_ENOSPC_str_hash_create; out: bch2_trans_iter_exit(trans, &slot); - bch2_trans_iter_exit(trans, &iter); - - return ret; + bch2_trans_iter_exit(trans, iter); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; found: found = true; not_found: - - if (!found && (flags & STR_HASH_must_replace)) { + if (found && (flags & STR_HASH_must_create)) { + bch2_trans_iter_exit(trans, &slot); + return k; + } else if (!found && (flags & STR_HASH_must_replace)) { ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; - } else if (found && (flags & STR_HASH_must_create)) { - ret = -BCH_ERR_EEXIST_str_hash_set; } else { if (!found && slot.path) - swap(iter, slot); + swap(*iter, slot); - insert->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, insert, flags); + insert->k.p = iter->pos; + ret = bch2_trans_update(trans, iter, insert, flags); } goto out; } static __always_inline +int bch2_hash_set_in_snapshot(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, u32 snapshot, + struct bkey_i *insert, + enum btree_iter_update_trigger_flags flags) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum, + snapshot, insert, flags); + int ret = bkey_err(k); + if (ret) + return ret; + if (k.k) { + bch2_trans_iter_exit(trans, &iter); + return -BCH_ERR_EEXIST_str_hash_set; + } + + return 0; +} + +static __always_inline int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, @@ -363,8 +384,11 @@ int bch2_hash_delete(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key, BTREE_ITER_intent); - int ret = bkey_err(k) ?: - 
bch2_hash_delete_at(trans, desc, info, &iter, 0); + int ret = bkey_err(k); + if (ret) + return ret; + + ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 91d8187ee168..80e5efaff524 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -319,8 +319,7 @@ int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol) { - return bch2_trans_do(c, NULL, NULL, 0, - bch2_subvol_is_ro_trans(trans, subvol)); + return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol)); } int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, @@ -676,8 +675,8 @@ err: /* set bi_subvol on root inode */ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) { - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, - __bch2_fs_upgrade_for_subvolumes(trans)); + int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + __bch2_fs_upgrade_for_subvolumes(trans)); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index ce7410d72089..7c71594f6a8b 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -287,6 +287,11 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out return -BCH_ERR_invalid_sb_layout_nr_superblocks; } + if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) { + prt_printf(out, "Invalid superblock layout: max_size_bits too high"); + return -BCH_ERR_invalid_sb_layout_sb_max_size_bits; + } + max_sectors = 1 << layout->sb_max_size_bits; prev_offset = le64_to_cpu(layout->sb_offset[0]); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 873e4be7e1dc..a6ed9a0bf1c7 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -184,6 +184,7 @@ static DEFINE_MUTEX(bch_fs_list_lock); DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); +static void 
bch2_dev_unlink(struct bch_dev *); static void bch2_dev_free(struct bch_dev *); static int bch2_dev_alloc(struct bch_fs *, unsigned); static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); @@ -271,6 +272,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) clean_passes++; if (bch2_btree_interior_updates_flush(c) || + bch2_btree_write_buffer_flush_going_ro(c) || bch2_journal_flush_all_pins(&c->journal) || bch2_btree_flush_all_writes(c) || seq != atomic64_read(&c->journal.seq)) { @@ -620,9 +622,7 @@ void __bch2_fs_stop(struct bch_fs *c) up_write(&c->state_lock); for_each_member_device(c, ca) - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev) - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + bch2_dev_unlink(ca); if (c->kobj.state_in_sysfs) kobject_del(&c->kobj); @@ -1187,9 +1187,7 @@ static void bch2_dev_free(struct bch_dev *ca) { cancel_work_sync(&ca->io_error_work); - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev) - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + bch2_dev_unlink(ca); if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); @@ -1226,10 +1224,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) percpu_ref_kill(&ca->io_ref); wait_for_completion(&ca->io_ref_completion); - if (ca->kobj.state_in_sysfs) { - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); - sysfs_remove_link(&ca->kobj, "block"); - } + bch2_dev_unlink(ca); bch2_free_super(&ca->disk_sb); bch2_dev_journal_exit(ca); @@ -1251,6 +1246,26 @@ static void bch2_dev_io_ref_complete(struct percpu_ref *ref) complete(&ca->io_ref_completion); } +static void bch2_dev_unlink(struct bch_dev *ca) +{ + struct kobject *b; + + /* + * This is racy w.r.t. the underlying block device being hot-removed, + * which removes it from sysfs. 
+ * + * It'd be lovely if we had a way to handle this race, but the sysfs + * code doesn't appear to provide a good method and block/holder.c is + * susceptible as well: + */ + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev && + (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) { + sysfs_remove_link(b, "bcachefs"); + sysfs_remove_link(&ca->kobj, "block"); + } +} + static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) { int ret; @@ -1958,7 +1973,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) }; u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; - ret = bch2_trans_do(ca->fs, NULL, NULL, 0, + ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); if (ret) diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index b2f209743afe..fb5c1543e52f 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -450,7 +450,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, k.k_i.k.p.snapshot = snapid; k.k_i.k.size = len; - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_commit_do(c, NULL, NULL, 0, bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, BTREE_UPDATE_internal_snapshot_node)); bch_err_fn(c, ret); @@ -510,7 +510,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr) if (ret) return ret; - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_commit_do(c, NULL, NULL, 0, bch2_snapshot_node_create(trans, U32_MAX, snapids, snapid_subvols, @@ -809,6 +809,11 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, unsigned i; u64 time; + if (nr == 0 || nr_threads == 0) { + pr_err("nr of iterations or threads is not allowed to be 0"); + return -EINVAL; + } + atomic_set(&j.ready, nr_threads); init_waitqueue_head(&j.ready_wait); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 56c8d3fe55a4..952aca400faf 100644 --- a/fs/bcachefs/xattr.c 
+++ b/fs/bcachefs/xattr.c @@ -330,7 +330,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret = bch2_trans_do(c, NULL, NULL, 0, + int ret = bch2_trans_do(c, bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags)); if (ret < 0 && bch2_err_matches(ret, ENOENT)) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index f92f108840f5..8f430ff8e445 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -11,12 +11,13 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/errno.h> #include <linux/stat.h> #include <linux/nls.h> #include <linux/buffer_head.h> #include <linux/vfs.h> -#include <linux/parser.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/cred.h> @@ -54,22 +55,20 @@ static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, char **out, int *out_len); static void befs_put_super(struct super_block *); -static int befs_remount(struct super_block *, int *, char *); static int befs_statfs(struct dentry *, struct kstatfs *); static int befs_show_options(struct seq_file *, struct dentry *); -static int parse_options(char *, struct befs_mount_options *); static struct dentry *befs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); static struct dentry *befs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); static struct dentry *befs_get_parent(struct dentry *child); +static void befs_free_fc(struct fs_context *fc); static const struct super_operations befs_sops = { .alloc_inode = befs_alloc_inode, /* allocate a new inode */ .free_inode = befs_free_inode, /* deallocate an inode */ .put_super = befs_put_super, /* uninit super */ .statfs = befs_statfs, /* 
statfs */ - .remount_fs = befs_remount, .show_options = befs_show_options, }; @@ -672,92 +671,53 @@ static struct dentry *befs_get_parent(struct dentry *child) } enum { - Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, + Opt_uid, Opt_gid, Opt_charset, Opt_debug, }; -static const match_table_t befs_tokens = { - {Opt_uid, "uid=%d"}, - {Opt_gid, "gid=%d"}, - {Opt_charset, "iocharset=%s"}, - {Opt_debug, "debug"}, - {Opt_err, NULL} +static const struct fs_parameter_spec befs_param_spec[] = { + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), + fsparam_string ("iocharset", Opt_charset), + fsparam_flag ("debug", Opt_debug), + {} }; static int -parse_options(char *options, struct befs_mount_options *opts) +befs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - kuid_t uid; - kgid_t gid; - - /* Initialize options */ - opts->uid = GLOBAL_ROOT_UID; - opts->gid = GLOBAL_ROOT_GID; - opts->use_uid = 0; - opts->use_gid = 0; - opts->iocharset = NULL; - opts->debug = 0; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, befs_tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - return 0; - uid = INVALID_UID; - if (option >= 0) - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) { - pr_err("Invalid uid %d, " - "using default\n", option); - break; - } - opts->uid = uid; - opts->use_uid = 1; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return 0; - gid = INVALID_GID; - if (option >= 0) - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) { - pr_err("Invalid gid %d, " - "using default\n", option); - break; - } - opts->gid = gid; - opts->use_gid = 1; - break; - case Opt_charset: - kfree(opts->iocharset); - opts->iocharset = match_strdup(&args[0]); - if (!opts->iocharset) { - pr_err("allocation failure for " - "iocharset 
string\n"); - return 0; - } - break; - case Opt_debug: - opts->debug = 1; - break; - default: - pr_err("Unrecognized mount option \"%s\" " - "or missing value\n", p); - return 0; - } + struct befs_mount_options *opts = fc->fs_private; + int token; + struct fs_parse_result result; + + /* befs ignores all options on remount */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) + return 0; + + token = fs_parse(fc, befs_param_spec, param, &result); + if (token < 0) + return token; + + switch (token) { + case Opt_uid: + opts->uid = result.uid; + opts->use_uid = 1; + break; + case Opt_gid: + opts->gid = result.gid; + opts->use_gid = 1; + break; + case Opt_charset: + kfree(opts->iocharset); + opts->iocharset = param->string; + param->string = NULL; + break; + case Opt_debug: + opts->debug = 1; + break; + default: + return -EINVAL; } - return 1; + return 0; } static int befs_show_options(struct seq_file *m, struct dentry *root) @@ -793,6 +753,21 @@ befs_put_super(struct super_block *sb) sb->s_fs_info = NULL; } +/* + * Copy the parsed options into the sbi mount_options member + */ +static void +befs_set_options(struct befs_sb_info *sbi, struct befs_mount_options *opts) +{ + sbi->mount_opts.uid = opts->uid; + sbi->mount_opts.gid = opts->gid; + sbi->mount_opts.use_uid = opts->use_uid; + sbi->mount_opts.use_gid = opts->use_gid; + sbi->mount_opts.debug = opts->debug; + sbi->mount_opts.iocharset = opts->iocharset; + opts->iocharset = NULL; +} + /* Allocate private field of the superblock, fill it. * * Finish filling the public superblock fields @@ -800,7 +775,7 @@ befs_put_super(struct super_block *sb) * Load a set of NLS translations if needed. 
*/ static int -befs_fill_super(struct super_block *sb, void *data, int silent) +befs_fill_super(struct super_block *sb, struct fs_context *fc) { struct buffer_head *bh; struct befs_sb_info *befs_sb; @@ -810,6 +785,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) const unsigned long sb_block = 0; const off_t x86_sb_off = 512; int blocksize; + struct befs_mount_options *parsed_opts = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL); if (sb->s_fs_info == NULL) @@ -817,11 +794,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) befs_sb = BEFS_SB(sb); - if (!parse_options((char *) data, &befs_sb->mount_opts)) { - if (!silent) - befs_error(sb, "cannot parse mount options"); - goto unacquire_priv_sbp; - } + befs_set_options(befs_sb, parsed_opts); befs_debug(sb, "---> %s", __func__); @@ -934,10 +907,10 @@ unacquire_none: } static int -befs_remount(struct super_block *sb, int *flags, char *data) +befs_reconfigure(struct fs_context *fc) { - sync_filesystem(sb); - if (!(*flags & SB_RDONLY)) + sync_filesystem(fc->root->d_sb); + if (!(fc->sb_flags & SB_RDONLY)) return -EINVAL; return 0; } @@ -965,19 +938,51 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static struct dentry * -befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data) +static int befs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, befs_fill_super); +} + +static const struct fs_context_operations befs_context_ops = { + .parse_param = befs_parse_param, + .get_tree = befs_get_tree, + .reconfigure = befs_reconfigure, + .free = befs_free_fc, +}; + +static int befs_init_fs_context(struct fs_context *fc) +{ + struct befs_mount_options *opts; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return -ENOMEM; + + /* Initialize options */ + opts->uid = GLOBAL_ROOT_UID; + opts->gid = GLOBAL_ROOT_GID; + + fc->fs_private = opts; + fc->ops = 
&befs_context_ops; + + return 0; +} + +static void befs_free_fc(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super); + struct befs_mount_options *opts = fc->fs_private; + + kfree(opts->iocharset); + kfree(fc->fs_private); } static struct file_system_type befs_fs_type = { .owner = THIS_MODULE, .name = "befs", - .mount = befs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = befs_init_fs_context, + .parameters = befs_param_spec, }; MODULE_ALIAS_FS("befs"); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 06dc4a57ba78..3039a6b7aba4 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -258,6 +258,12 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); #endif +#ifdef ELF_HWCAP3 + NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3); +#endif +#ifdef ELF_HWCAP4 + NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4); +#endif NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { NEW_AUX_ENT(AT_PLATFORM, diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 4fe5bb9f1b1f..31d253bd3961 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -624,6 +624,12 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); #endif +#ifdef ELF_HWCAP3 + NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3); +#endif +#ifdef ELF_HWCAP4 + NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4); +#endif NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 4fb925e8c981..fa8515598341 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -78,6 +78,32 @@ config BTRFS_ASSERT If unsure, say N. +config BTRFS_EXPERIMENTAL + bool "Btrfs experimental features" + depends on BTRFS_FS + default n + help + Enable experimental features. These features may not be stable enough + for end users. 
This is meant for btrfs developers or users who wish + to test the functionality and report problems. + + Current list: + + - extent map shrinker - performance problems with too frequent shrinks + + - send stream protocol v3 - fs-verity support + + - checksum offload mode - sysfs knob to affect when checksums are + calculated (at IO time, or in a thread) + + - raid-stripe-tree - additional mapping of extents to devices to + support RAID1* profiles on zoned devices, + RAID56 not yet supported + + - extent tree v2 - complex rework of extent tracking + + If unsure, say N. + config BTRFS_FS_REF_VERIFY bool "Btrfs with the ref verify tool compiled in" depends on BTRFS_FS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 87617f2968bc..3cfc440c636c 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -43,4 +43,5 @@ btrfs-$(CONFIG_FS_VERITY) += verity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ - tests/free-space-tree-tests.o tests/extent-map-tests.o + tests/free-space-tree-tests.o tests/extent-map-tests.o \ + tests/raid-stripe-tree-tests.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f8e1d5b2c512..04f53ca548e1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1442,7 +1442,8 @@ again: */ delayed_refs = &ctx->trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr); + head = btrfs_find_delayed_ref_head(ctx->fs_info, delayed_refs, + ctx->bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index fec5c6cde0a7..1f216d07eff6 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -49,6 +49,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, bbio->end_io = end_io; bbio->private = private; atomic_set(&bbio->pending_ios, 1); + 
WRITE_ONCE(bbio->status, BLK_STS_OK); } /* @@ -113,41 +114,29 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio) } } -static void btrfs_orig_write_end_io(struct bio *bio); - -static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, - struct btrfs_bio *orig_bbio) -{ - /* - * For writes we tolerate nr_mirrors - 1 write failures, so we can't - * just blindly propagate a write failure here. Instead increment the - * error count in the original I/O context so that it is guaranteed to - * be larger than the error tolerance. - */ - if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { - struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; - struct btrfs_io_context *orig_bioc = orig_stripe->bioc; - - atomic_add(orig_bioc->max_errors, &orig_bioc->error); - } else { - orig_bbio->bio.bi_status = bbio->bio.bi_status; - } -} - void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; - if (bbio->bio.bi_status) - btrfs_bbio_propagate_error(bbio, orig_bbio); btrfs_cleanup_bio(bbio); bbio = orig_bbio; } - if (atomic_dec_and_test(&bbio->pending_ios)) + /* + * At this point, bbio always points to the original btrfs_bio. Save + * the first error in it. + */ + if (status != BLK_STS_OK) + cmpxchg(&bbio->status, BLK_STS_OK, status); + + if (atomic_dec_and_test(&bbio->pending_ios)) { + /* Load split bio's error which might be set above. 
*/ + if (status == BLK_STS_OK) + bbio->bio.bi_status = READ_ONCE(bbio->status); __btrfs_bio_end_io(bbio); + } } static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) @@ -598,7 +587,7 @@ static bool should_async_write(struct btrfs_bio *bbio) { bool auto_csum_mode = true; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index e48612340745..e2fe16074ad6 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -79,6 +79,9 @@ struct btrfs_bio { /* File system that this I/O operates on. */ struct btrfs_fs_info *fs_info; + /* Save the first error status of split bio. */ + blk_status_t status; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 7980b2e33a92..4427c1b835e8 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2797,7 +2797,7 @@ next: * uncompressed data size, because the compression is only done * when writeback triggered and we don't know how much space we * are actually going to need, so we reserve the uncompressed - * size because the data may be uncompressible in the worst case. + * size because the data may be incompressible in the worst case. 
*/ if (ret == 0) { bool used; @@ -3819,6 +3819,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, spin_lock(&cache->lock); if (cache->ro) space_info->bytes_readonly += num_bytes; + else if (btrfs_is_zoned(cache->fs_info)) + space_info->bytes_zone_unusable += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; space_info->max_extent_size = 0; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index e152fde888fc..aa1f55cd81b7 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -577,7 +577,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state struct extent_state *other); void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split); -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); void btrfs_evict_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); @@ -613,11 +612,17 @@ int btrfs_writepage_cow_fixup(struct folio *folio); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, - struct page **pages); + u64 disk_bytenr, u64 disk_io_size, + struct page **pages, void *uring_ctx); ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded); + struct btrfs_ioctl_encoded_io_args *encoded, + struct extent_state **cached_state, + u64 *disk_bytenr, u64 *disk_io_size); +ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, bool *unlocked); ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args 
*encoded); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 90aef2627ca2..0c4d486c3048 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -453,7 +453,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - folio = __filemap_get_folio(mapping, pg_index, 0, 0); + folio = filemap_get_folio(mapping, pg_index); if (!IS_ERR(folio)) { u64 folio_sz = folio_size(folio); u64 offset = offset_in_folio(folio, cur); @@ -545,8 +545,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * subpage::readers and to unlock the page. */ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_subpage_start_reader(fs_info, folio, cur, - add_size); + btrfs_folio_set_lock(fs_info, folio, cur, add_size); folio_put(folio); cur += add_size; } @@ -702,7 +701,7 @@ static void free_heuristic_ws(struct list_head *ws) kfree(workspace); } -static struct list_head *alloc_heuristic_ws(unsigned int level) +static struct list_head *alloc_heuristic_ws(void) { struct heuristic_ws *ws; @@ -744,9 +743,9 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { static struct list_head *alloc_workspace(int type, unsigned int level) { switch (type) { - case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level); + case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); - case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level); + case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(); case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level); default: /* @@ -1030,6 +1029,7 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping { int type = btrfs_compress_type(type_level); int level = btrfs_compress_level(type_level); + const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; @@ -1037,6 +1037,8 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping workspace = get_workspace(type, 
level); ret = compression_compress_pages(type, workspace, mapping, start, folios, out_folios, total_in, total_out); + /* The total read-in bytes should be no larger than the input. */ + ASSERT(*total_in <= orig_len); put_workspace(type, workspace); return ret; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index b6563b6a333e..954034086d0d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -175,7 +175,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -struct list_head *lzo_alloc_workspace(unsigned int level); +struct list_head *lzo_alloc_workspace(void); void lzo_free_workspace(struct list_head *ws); int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0cc919d15b14..148648ea1c8b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1508,26 +1508,26 @@ static noinline void unlock_up(struct btrfs_path *path, int level, */ static int read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer **eb_ret, int level, int slot, + struct extent_buffer **eb_ret, int slot, const struct btrfs_key *key) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; - u64 gen; - struct extent_buffer *tmp; - int ret; + struct extent_buffer *tmp = NULL; + int ret = 0; int parent_level; - bool unlock_up; + int err; + bool read_tmp = false; + bool tmp_locked = false; + bool path_released = false; - unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); blocknr = btrfs_node_blockptr(*eb_ret, slot); - gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); check.has_first_key = true; check.level = parent_level - 1; - 
check.transid = gen; + check.transid = btrfs_node_ptr_generation(*eb_ret, slot); check.owner_root = btrfs_root_id(root); /* @@ -1540,79 +1540,115 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { if (p->reada == READA_FORWARD_ALWAYS) - reada_for_search(fs_info, p, level, slot, key->objectid); + reada_for_search(fs_info, p, parent_level, slot, key->objectid); /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) { /* * Do extra check for first_key, eb can be stale due to * being cached, read from scrub, or have multiple * parents (shared tree blocks). */ - if (btrfs_verify_level_key(tmp, - parent_level - 1, &check.first_key, gen)) { - free_extent_buffer(tmp); - return -EUCLEAN; + if (btrfs_verify_level_key(tmp, &check)) { + ret = -EUCLEAN; + goto out; } *eb_ret = tmp; - return 0; + tmp = NULL; + ret = 0; + goto out; } if (p->nowait) { - free_extent_buffer(tmp); - return -EAGAIN; + ret = -EAGAIN; + goto out; } - if (unlock_up) - btrfs_unlock_up_safe(p, level + 1); - - /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_extent_buffer(tmp, &check); - if (ret) { - free_extent_buffer(tmp); + if (!p->skip_locking) { + btrfs_unlock_up_safe(p, parent_level + 1); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return ret; + ret = -EAGAIN; + path_released = true; } - if (unlock_up) - ret = -EAGAIN; + /* Now we're allowed to do a blocking uptodate check. 
*/ + err = btrfs_read_extent_buffer(tmp, &check); + if (err) { + ret = err; + goto out; + } + if (ret == 0) { + ASSERT(!tmp_locked); + *eb_ret = tmp; + tmp = NULL; + } goto out; } else if (p->nowait) { - return -EAGAIN; + ret = -EAGAIN; + goto out; } - if (unlock_up) { - btrfs_unlock_up_safe(p, level + 1); + if (!p->skip_locking) { + btrfs_unlock_up_safe(p, parent_level + 1); ret = -EAGAIN; - } else { - ret = 0; } if (p->reada != READA_NONE) - reada_for_search(fs_info, p, level, slot, key->objectid); + reada_for_search(fs_info, p, parent_level, slot, key->objectid); - tmp = read_tree_block(fs_info, blocknr, &check); + tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level); if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + tmp = NULL; + goto out; + } + read_tmp = true; + + if (!p->skip_locking) { + ASSERT(ret == -EAGAIN); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return PTR_ERR(tmp); + path_released = true; + } + + /* Now we're allowed to do a blocking uptodate check. */ + err = btrfs_read_extent_buffer(tmp, &check); + if (err) { + ret = err; + goto out; } + /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now * and give up so that our caller doesn't loop forever * on our EAGAINs. 
*/ - if (!extent_buffer_uptodate(tmp)) + if (!extent_buffer_uptodate(tmp)) { ret = -EIO; + goto out; + } -out: if (ret == 0) { + ASSERT(!tmp_locked); *eb_ret = tmp; - } else { - free_extent_buffer(tmp); - btrfs_release_path(p); + tmp = NULL; + } +out: + if (tmp) { + if (tmp_locked) + btrfs_tree_read_unlock(tmp); + if (read_tmp && ret && ret != -EAGAIN) + free_extent_buffer_stale(tmp); + else + free_extent_buffer(tmp); } + if (ret && !path_released) + btrfs_release_path(p); return ret; } @@ -2197,8 +2233,8 @@ cow_done: goto done; } - err = read_block_for_search(root, p, &b, level, slot, key); - if (err == -EAGAIN) + err = read_block_for_search(root, p, &b, slot, key); + if (err == -EAGAIN && !p->nowait) goto again; if (err) { ret = err; @@ -2324,8 +2360,8 @@ again: goto done; } - err = read_block_for_search(root, p, &b, level, slot, key); - if (err == -EAGAIN) + err = read_block_for_search(root, p, &b, slot, key); + if (err == -EAGAIN && !p->nowait) goto again; if (err) { ret = err; @@ -2334,7 +2370,7 @@ again: level = btrfs_header_level(b); btrfs_tree_read_lock(b); - b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); + b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq); if (!b) { ret = -ENOMEM; goto done; @@ -4930,8 +4966,7 @@ again: } next = c; - ret = read_block_for_search(root, path, &next, level, - slot, &key); + ret = read_block_for_search(root, path, &next, slot, &key); if (ret == -EAGAIN && !path->nowait) goto again; @@ -4974,8 +5009,7 @@ again: if (!level) break; - ret = read_block_for_search(root, path, &next, level, - 0, &key); + ret = read_block_for_search(root, path, &next, 0, &key); if (ret == -EAGAIN && !path->nowait) goto again; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 317a3712270f..307dedf95c70 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -744,16 +744,11 @@ const char *btrfs_super_csum_driver(u16 csum_type); size_t __attribute_const__ btrfs_get_num_csums(void); /* - * We use page status Private2 to indicate there is an 
ordered extent with + * We use folio flag owner_2 to indicate there is an ordered extent with * unfinished IO. - * - * Rename the Private2 accessors to Ordered, to improve readability. */ -#define PageOrdered(page) PagePrivate2(page) -#define SetPageOrdered(page) SetPagePrivate2(page) -#define ClearPageOrdered(page) ClearPagePrivate2(page) -#define folio_test_ordered(folio) folio_test_private_2(folio) -#define folio_set_ordered(folio) folio_set_private_2(folio) -#define folio_clear_ordered(folio) folio_clear_private_2(folio) +#define folio_test_ordered(folio) folio_test_owner_2(folio) +#define folio_set_ordered(folio) folio_set_owner_2(folio) +#define folio_clear_ordered(folio) folio_clear_owner_2(folio) #endif diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index b95ef44c326b..968dae953948 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -763,12 +763,12 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * We can get a merged extent, in that case, we need to re-search * tree to get the original em for defrag. * - * If @newer_than is 0 or em::generation < newer_than, we can trust - * this em, as either we don't care about the generation, or the - * merged extent map will be rejected anyway. + * This is because even if we have adjacent extents that are contiguous + * and compatible (same type and flags), we still want to defrag them + * so that we use less metadata (extent items in the extent tree and + * file extent items in the inode's subvolume tree). 
*/ - if (em && (em->flags & EXTENT_FLAG_MERGED) && - newer_than && em->generation >= newer_than) { + if (em && (em->flags & EXTENT_FLAG_MERGED)) { free_extent_map(em); em = NULL; } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 7cfefdfe54ea..f4d9feac0d0e 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -64,9 +64,9 @@ struct btrfs_delayed_node { struct mutex mutex; struct btrfs_inode_item inode_item; refcount_t refs; + int count; u64 index_cnt; unsigned long flags; - int count; /* * The size of the next batch of dir index items to insert (if this * node is from a directory inode). Protected by @mutex. diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 32f719b9e661..0d878dbbabba 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -9,6 +9,7 @@ #include "messages.h" #include "ctree.h" #include "delayed-ref.h" +#include "extent-tree.h" #include "transaction.h" #include "qgroup.h" #include "space-info.h" @@ -298,7 +299,7 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, if (ref1->ref_root < ref2->ref_root) return -1; if (ref1->ref_root > ref2->ref_root) - return -1; + return 1; if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY) ret = comp_data_refs(ref1, ref2); } @@ -313,39 +314,6 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, return 0; } -/* insert a new ref to head ref rbtree */ -static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_root.rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_head *entry; - struct btrfs_delayed_ref_head *ins; - u64 bytenr; - bool leftmost = true; - - ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); - bytenr = ins->bytenr; - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, - href_node); - - if (bytenr < entry->bytenr) { - p = &(*p)->rb_left; - } else if (bytenr > entry->bytenr) 
{ - p = &(*p)->rb_right; - leftmost = false; - } else { - return entry; - } - } - - rb_link_node(node, parent_node, p); - rb_insert_color_cached(node, root, leftmost); - return NULL; -} - static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { @@ -380,75 +348,32 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, static struct btrfs_delayed_ref_head *find_first_ref_head( struct btrfs_delayed_ref_root *dr) { - struct rb_node *n; - struct btrfs_delayed_ref_head *entry; - - n = rb_first_cached(&dr->href_root); - if (!n) - return NULL; + unsigned long from = 0; - entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); + lockdep_assert_held(&dr->lock); - return entry; + return xa_find(&dr->head_refs, &from, ULONG_MAX, XA_PRESENT); } -/* - * Find a head entry based on bytenr. This returns the delayed ref head if it - * was able to find one, or NULL if nothing was in that spot. If return_bigger - * is given, the next bigger entry is returned if no exact match is found. 
- */ -static struct btrfs_delayed_ref_head *find_ref_head( - struct btrfs_delayed_ref_root *dr, u64 bytenr, - bool return_bigger) -{ - struct rb_root *root = &dr->href_root.rb_root; - struct rb_node *n; - struct btrfs_delayed_ref_head *entry; - - n = root->rb_node; - entry = NULL; - while (n) { - entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - - if (bytenr < entry->bytenr) - n = n->rb_left; - else if (bytenr > entry->bytenr) - n = n->rb_right; - else - return entry; - } - if (entry && return_bigger) { - if (bytenr > entry->bytenr) { - n = rb_next(&entry->href_node); - if (!n) - return NULL; - entry = rb_entry(n, struct btrfs_delayed_ref_head, - href_node); - } - return entry; - } - return NULL; -} - -int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) +static bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) { lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) - return 0; + return true; refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); spin_lock(&delayed_refs->lock); - if (RB_EMPTY_NODE(&head->href_node)) { + if (!head->tracked) { mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); - return -EAGAIN; + return false; } btrfs_put_delayed_ref_head(head); - return 0; + return true; } static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, @@ -462,7 +387,6 @@ static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, if (!list_empty(&ref->add_list)) list_del(&ref->add_list); btrfs_put_delayed_ref(ref); - atomic_dec(&delayed_refs->num_entries); btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } @@ -558,33 +482,31 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) } struct btrfs_delayed_ref_head *btrfs_select_ref_head( + const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs) { struct 
btrfs_delayed_ref_head *head; + unsigned long start_index; + unsigned long found_index; + bool found_head = false; + bool locked; - lockdep_assert_held(&delayed_refs->lock); + spin_lock(&delayed_refs->lock); again: - head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, - true); - if (!head && delayed_refs->run_delayed_start != 0) { - delayed_refs->run_delayed_start = 0; - head = find_first_ref_head(delayed_refs); + start_index = (delayed_refs->run_delayed_start >> fs_info->sectorsize_bits); + xa_for_each_start(&delayed_refs->head_refs, found_index, head, start_index) { + if (!head->processing) { + found_head = true; + break; + } } - if (!head) - return NULL; - - while (head->processing) { - struct rb_node *node; - - node = rb_next(&head->href_node); - if (!node) { - if (delayed_refs->run_delayed_start == 0) - return NULL; - delayed_refs->run_delayed_start = 0; - goto again; + if (!found_head) { + if (delayed_refs->run_delayed_start == 0) { + spin_unlock(&delayed_refs->lock); + return NULL; } - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); + delayed_refs->run_delayed_start = 0; + goto again; } head->processing = true; @@ -592,18 +514,42 @@ again: delayed_refs->num_heads_ready--; delayed_refs->run_delayed_start = head->bytenr + head->num_bytes; + + locked = btrfs_delayed_ref_lock(delayed_refs, head); + spin_unlock(&delayed_refs->lock); + + /* + * We may have dropped the spin lock to get the head mutex lock, and + * that might have given someone else time to free the head. If that's + * true, it has been removed from our list and we can move on. 
+ */ + if (!locked) + return ERR_PTR(-EAGAIN); + return head; } -void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + spin_lock(&delayed_refs->lock); + head->processing = false; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(head); +} + +void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { + const unsigned long index = (head->bytenr >> fs_info->sectorsize_bits); + lockdep_assert_held(&delayed_refs->lock); lockdep_assert_held(&head->lock); - rb_erase_cached(&head->href_node, &delayed_refs->href_root); - RB_CLEAR_NODE(&head->href_node); - atomic_dec(&delayed_refs->num_entries); + xa_erase(&delayed_refs->head_refs, index); + head->tracked = false; delayed_refs->num_heads--; if (!head->processing) delayed_refs->num_heads_ready--; @@ -629,7 +575,6 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans, if (!exist) { if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); - atomic_inc(&root->num_entries); spin_unlock(&href->lock); trans->delayed_ref_updates++; return false; @@ -649,7 +594,7 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans, &href->ref_add_list); else if (ref->action == BTRFS_DROP_DELAYED_REF) { ASSERT(!list_empty(&exist->add_list)); - list_del(&exist->add_list); + list_del_init(&exist->add_list); } else { ASSERT(0); } @@ -813,7 +758,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID); head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); - RB_CLEAR_NODE(&head_ref->href_node); + head_ref->tracked = false; head_ref->processing = false; head_ref->total_ref_mod = count_mod; 
spin_lock_init(&head_ref->lock); @@ -830,7 +775,6 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, qrecord->data_rsv = reserved; qrecord->data_rsv_refroot = generic_ref->ref_root; } - qrecord->bytenr = generic_ref->bytenr; qrecord->num_bytes = generic_ref->num_bytes; qrecord->old_roots = NULL; } @@ -849,21 +793,36 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord, int action, bool *qrecord_inserted_ret) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; + const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits); bool qrecord_inserted = false; delayed_refs = &trans->transaction->delayed_refs; + lockdep_assert_held(&delayed_refs->lock); + +#if BITS_PER_LONG == 32 + if (head_ref->bytenr >= MAX_LFS_FILESIZE) { + if (qrecord) + xa_release(&delayed_refs->dirty_extents, index); + btrfs_err_rl(fs_info, +"delayed ref head %llu is beyond 32bit page cache and xarray index limit", + head_ref->bytenr); + btrfs_err_32bit_limit(fs_info); + return ERR_PTR(-EOVERFLOW); + } +#endif /* Record qgroup extent info if provided */ if (qrecord) { int ret; - ret = btrfs_qgroup_trace_extent_nolock(trans->fs_info, - delayed_refs, qrecord); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord, + head_ref->bytenr); if (ret) { /* Clean up if insertion fails or item exists. */ - xa_release(&delayed_refs->dirty_extents, qrecord->bytenr); + xa_release(&delayed_refs->dirty_extents, index); /* Caller responsible for freeing qrecord on error. 
*/ if (ret < 0) return ERR_PTR(ret); @@ -873,10 +832,9 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } } - trace_add_delayed_ref_head(trans->fs_info, head_ref, action); + trace_add_delayed_ref_head(fs_info, head_ref, action); - existing = htree_insert(&delayed_refs->href_root, - &head_ref->href_node); + existing = xa_load(&delayed_refs->head_refs, index); if (existing) { update_existing_head_ref(trans, existing, head_ref); /* @@ -886,6 +844,19 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + existing = xa_store(&delayed_refs->head_refs, index, head_ref, GFP_ATOMIC); + if (xa_is_err(existing)) { + /* Memory was preallocated by the caller. */ + ASSERT(xa_err(existing) != -ENOMEM); + return ERR_PTR(xa_err(existing)); + } else if (WARN_ON(existing)) { + /* + * Shouldn't happen we just did a lookup before under + * delayed_refs->lock. + */ + return ERR_PTR(-EEXIST); + } + head_ref->tracked = true; /* * We reserve the amount of bytes needed to delete csums when * adding the ref head and not when adding individual drop refs @@ -895,12 +866,10 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_csum_deletions += - btrfs_csum_bytes_to_leaves(trans->fs_info, - head_ref->num_bytes); + btrfs_csum_bytes_to_leaves(fs_info, head_ref->num_bytes); } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; - atomic_inc(&delayed_refs->num_entries); } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; @@ -1008,6 +977,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *new_head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; + const unsigned long index = (generic_ref->bytenr >> fs_info->sectorsize_bits); + bool qrecord_reserved = false; 
bool qrecord_inserted; int action = generic_ref->action; bool merged; @@ -1023,24 +994,32 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, goto free_node; } + delayed_refs = &trans->transaction->delayed_refs; + if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { ret = -ENOMEM; goto free_head_ref; } - if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, - generic_ref->bytenr, GFP_NOFS)) { + if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { ret = -ENOMEM; goto free_record; } + qrecord_reserved = true; + } + + ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS); + if (ret) { + if (qrecord_reserved) + xa_release(&delayed_refs->dirty_extents, index); + goto free_record; } init_delayed_ref_common(fs_info, node, generic_ref); init_delayed_ref_head(head_ref, generic_ref, record, reserved); head_ref->extent_op = extent_op; - delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); /* @@ -1050,6 +1029,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, new_head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); if (IS_ERR(new_head_ref)) { + xa_release(&delayed_refs->head_refs, index); spin_unlock(&delayed_refs->lock); ret = PTR_ERR(new_head_ref); goto free_record; @@ -1073,7 +1053,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_node_cachep, node); if (qrecord_inserted) - return btrfs_qgroup_trace_extent_post(trans, record); + return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr); return 0; free_record: @@ -1112,6 +1092,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u8 level, struct btrfs_delayed_extent_op *extent_op) { + const unsigned long index = (bytenr >> trans->fs_info->sectorsize_bits); struct btrfs_delayed_ref_head *head_ref; struct 
btrfs_delayed_ref_head *head_ref_ret; struct btrfs_delayed_ref_root *delayed_refs; @@ -1122,6 +1103,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, .num_bytes = num_bytes, .tree_ref.level = level, }; + int ret; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) @@ -1131,16 +1113,23 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); + ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS); + if (ret) { + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + return ret; + } + + spin_lock(&delayed_refs->lock); head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, NULL); - spin_unlock(&delayed_refs->lock); - if (IS_ERR(head_ref_ret)) { + xa_release(&delayed_refs->head_refs, index); + spin_unlock(&delayed_refs->lock); kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); return PTR_ERR(head_ref_ret); } + spin_unlock(&delayed_refs->lock); /* * Need to update the delayed_refs_rsv with any changes we may have @@ -1163,11 +1152,15 @@ void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) * head node if found, or NULL if not. 
*/ struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) +btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + u64 bytenr) { + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); + lockdep_assert_held(&delayed_refs->lock); - return find_ref_head(delayed_refs, bytenr, false); + return xa_load(&delayed_refs->head_refs, index); } static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent) @@ -1237,6 +1230,81 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, return found; } +void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + + spin_lock(&delayed_refs->lock); + while (true) { + struct btrfs_delayed_ref_head *head; + struct rb_node *n; + bool pin_bytes = false; + + head = find_first_ref_head(delayed_refs); + if (!head) + break; + + if (!btrfs_delayed_ref_lock(delayed_refs, head)) + continue; + + spin_lock(&head->lock); + while ((n = rb_first_cached(&head->ref_tree)) != NULL) { + struct btrfs_delayed_ref_node *ref; + + ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); + drop_delayed_ref(fs_info, delayed_refs, head, ref); + } + if (head->must_insert_reserved) + pin_bytes = true; + btrfs_free_delayed_extent_op(head->extent_op); + btrfs_delete_ref_head(fs_info, delayed_refs, head); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + mutex_unlock(&head->mutex); + + if (pin_bytes) { + struct btrfs_block_group *bg; + + bg = btrfs_lookup_block_group(fs_info, head->bytenr); + if (WARN_ON_ONCE(bg == NULL)) { + /* + * Unexpected and there's nothing we can do here + * because we are in a transaction abort path, + * so any errors can only be ignored or reported + * while attempting to cleanup all resources. 
+ */ + btrfs_err(fs_info, +"block group for delayed ref at %llu was not found while destroying ref head", + head->bytenr); + } else { + spin_lock(&bg->space_info->lock); + spin_lock(&bg->lock); + bg->pinned += head->num_bytes; + btrfs_space_info_update_bytes_pinned(fs_info, + bg->space_info, + head->num_bytes); + bg->reserved -= head->num_bytes; + bg->space_info->bytes_reserved -= head->num_bytes; + spin_unlock(&bg->lock); + spin_unlock(&bg->space_info->lock); + + btrfs_put_block_group(bg); + } + + btrfs_error_unpin_extent_range(fs_info, head->bytenr, + head->bytenr + head->num_bytes - 1); + } + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + btrfs_put_delayed_ref_head(head); + cond_resched(); + spin_lock(&delayed_refs->lock); + } + btrfs_qgroup_destroy_extent_records(trans); + + spin_unlock(&delayed_refs->lock); +} + void __cold btrfs_delayed_ref_exit(void) { kmem_cache_destroy(btrfs_delayed_ref_head_cachep); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 085f30968aba..611fb3388f82 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -61,7 +61,8 @@ struct btrfs_delayed_ref_node { /* * If action is BTRFS_ADD_DELAYED_REF, also link this node to * ref_head->ref_add_list, then we do not need to iterate the - * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes. + * refs rbtree in the corresponding delayed ref head + * (struct btrfs_delayed_ref_head::ref_tree). */ struct list_head add_list; @@ -123,12 +124,6 @@ struct btrfs_delayed_ref_head { u64 bytenr; u64 num_bytes; /* - * For insertion into struct btrfs_delayed_ref_root::href_root. - * Keep it in the same cache line as 'bytenr' for more efficient - * searches in the rbtree. - */ - struct rb_node href_node; - /* * the mutex is held while running the refs, and it is also * held when checking the sum of reference modifications. 
*/ @@ -191,6 +186,11 @@ struct btrfs_delayed_ref_head { bool is_data; bool is_system; bool processing; + /* + * Indicate if it's currently in the data structure that tracks head + * refs (struct btrfs_delayed_ref_root::head_refs). + */ + bool tracked; }; enum btrfs_delayed_ref_flags { @@ -199,30 +199,52 @@ enum btrfs_delayed_ref_flags { }; struct btrfs_delayed_ref_root { - /* head ref rbtree */ - struct rb_root_cached href_root; + /* + * Track head references. + * The keys correspond to the logical address of the extent ("bytenr") + * right shifted by fs_info->sectorsize_bits. This is both to get a more + * dense index space (optimizes xarray structure) and because indexes in + * xarrays are of "unsigned long" type, meaning they are 32 bits wide on + * 32 bits platforms, limiting the extent range to 4G which is too low + * and makes it unusable (truncated index values) on 32 bits platforms. + * Protected by the spinlock 'lock' defined below. + */ + struct xarray head_refs; - /* Track dirty extent records. */ + /* + * Track dirty extent records. + * The keys correspond to the logical address of the extent ("bytenr") + * right shifted by fs_info->sectorsize_bits, for same reasons as above. + */ struct xarray dirty_extents; - /* this spin lock protects the rbtree and the entries inside */ - spinlock_t lock; - - /* how many delayed ref updates we've queued, used by the - * throttling code + /* + * Protects the xarray head_refs, its entries and the following fields: + * num_heads, num_heads_ready, pending_csums and run_delayed_start. */ - atomic_t num_entries; + spinlock_t lock; - /* total number of head nodes in tree */ + /* Total number of head refs, protected by the spinlock 'lock'. */ unsigned long num_heads; - /* total number of head nodes ready for processing */ + /* + * Total number of head refs ready for processing, protected by the + * spinlock 'lock'. + */ unsigned long num_heads_ready; + /* + * Track space reserved for deleting csums of data extents. 
+ * Protected by the spinlock 'lock'. + */ u64 pending_csums; unsigned long flags; + /* + * Track from which bytenr to start searching ref heads. + * Protected by the spinlock 'lock'. + */ u64 run_delayed_start; /* @@ -364,19 +386,22 @@ void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr); -int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head); static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { mutex_unlock(&head->mutex); } -void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head *btrfs_select_ref_head( + const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs); +void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); @@ -391,6 +416,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, u64 root, u64 parent); +void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans); static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 83d5cdd77f29..ac8e97ed13f7 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -45,7 +45,7 @@ * * - Copy existing extents * - * This happens by re-using scrub facility, as scrub 
also iterates through + * This happens by reusing scrub facility, as scrub also iterates through * existing extents from commit root. * * Location: scrub_write_block_to_dev_replace() from @@ -641,6 +641,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; down_write(&dev_replace->rwsem); + dev_replace->replace_task = current; switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -994,6 +995,7 @@ error: list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); fs_devices->rw_devices++; + dev_replace->replace_task = NULL; up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 001c0c2f872c..1ea5d8fcfbf7 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle const char *name, int name_len) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *ptr; struct extent_buffer *leaf; @@ -35,7 +34,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (ret == -EEXIST) { struct btrfs_dir_item *di; - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); if (di) return ERR_PTR(-EEXIST); btrfs_extend_item(trans, path, data_size); @@ -190,7 +189,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( if (ret > 0) return ERR_PTR(-ENOENT); - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + return btrfs_match_dir_item_name(path, name, name_len); } /* @@ -341,14 +340,13 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; - di = btrfs_match_dir_item_name(root->fs_info, path, - name->name, name->len); + di 
= btrfs_match_dir_item_name(path, name->name, name->len); if (di) return di; } /* Adjust return code if the key was not found in the next leaf. */ - if (ret > 0) - ret = 0; + if (ret >= 0) + ret = -ENOENT; return ERR_PTR(ret); } @@ -378,8 +376,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * this walks through all the entries in a dir item and finds one * for a specific name. */ -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - const struct btrfs_path *path, +struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, const char *name, int name_len) { struct btrfs_dir_item *dir_item; diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index 5f6dfafc91f1..28d69970bc70 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -44,8 +44,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - const struct btrfs_path *path, +struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, const char *name, int name_len); diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index bd38df5647e3..a7c3e221378d 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -834,7 +834,7 @@ relock: return ret; } - ret = btrfs_write_check(iocb, from, ret); + ret = btrfs_write_check(iocb, ret); if (ret < 0) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto out; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4ad5db619b00..814320948645 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -917,8 +917,7 @@ fail: return ERR_PTR(ret); } -static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; @@ -966,7 +965,7 @@ int 
btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, { struct btrfs_root *log_root; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -992,7 +991,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item; int ret; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -1959,7 +1958,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->qgroup_seq = 1; fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; - fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; + fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2786,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_init_scrub(fs_info); btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); + btrfs_init_extent_map_shrinker_work(fs_info); rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; @@ -2852,8 +2852,6 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; - spin_lock_init(&fs_info->extent_map_shrinker_lock); - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) return ret; @@ -3202,8 +3200,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) return 0; } -int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - const char *options) +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { u32 sectorsize; u32 nodesize; @@ -4186,7 +4183,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "transaction %llu (with %llu dirty metadata bytes) is not committed", trans->transid, dirty_bytes); - 
btrfs_cleanup_one_transaction(trans, fs_info); + btrfs_cleanup_one_transaction(trans); if (trans == fs_info->running_transaction) fs_info->running_transaction = NULL; @@ -4294,6 +4291,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); + cancel_work_sync(&fs_info->em_shrinker_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4531,75 +4529,6 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); } -static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info) -{ - struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; - struct btrfs_delayed_ref_node *ref; - - spin_lock(&delayed_refs->lock); - while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { - struct btrfs_delayed_ref_head *head; - struct rb_node *n; - bool pin_bytes = false; - - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - if (btrfs_delayed_ref_lock(delayed_refs, head)) - continue; - - spin_lock(&head->lock); - while ((n = rb_first_cached(&head->ref_tree)) != NULL) { - ref = rb_entry(n, struct btrfs_delayed_ref_node, - ref_node); - rb_erase_cached(&ref->ref_node, &head->ref_tree); - RB_CLEAR_NODE(&ref->ref_node); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - atomic_dec(&delayed_refs->num_entries); - btrfs_put_delayed_ref(ref); - btrfs_delayed_refs_rsv_release(fs_info, 1, 0); - } - if (head->must_insert_reserved) - pin_bytes = true; - btrfs_free_delayed_extent_op(head->extent_op); - btrfs_delete_ref_head(delayed_refs, head); - spin_unlock(&head->lock); - spin_unlock(&delayed_refs->lock); - mutex_unlock(&head->mutex); - - if (pin_bytes) { - struct btrfs_block_group *cache; - - cache = 
btrfs_lookup_block_group(fs_info, head->bytenr); - BUG_ON(!cache); - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += head->num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, - cache->space_info, head->num_bytes); - cache->reserved -= head->num_bytes; - cache->space_info->bytes_reserved -= head->num_bytes; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - btrfs_put_block_group(cache); - - btrfs_error_unpin_extent_range(fs_info, head->bytenr, - head->bytenr + head->num_bytes - 1); - } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); - btrfs_put_delayed_ref_head(head); - cond_resched(); - spin_lock(&delayed_refs->lock); - } - btrfs_qgroup_destroy_extent_records(trans); - - spin_unlock(&delayed_refs->lock); -} - static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; @@ -4805,9 +4734,9 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->fs_roots_radix_lock); } -void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) +void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans) { + struct btrfs_fs_info *fs_info = cur_trans->fs_info; struct btrfs_device *dev, *tmp; btrfs_cleanup_dirty_bgs(cur_trans, fs_info); @@ -4819,7 +4748,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, list_del_init(&dev->post_commit_list); } - btrfs_destroy_delayed_refs(cur_trans, fs_info); + btrfs_destroy_delayed_refs(cur_trans); cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&fs_info->transaction_blocked_wait); @@ -4865,7 +4794,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) } else { spin_unlock(&fs_info->trans_lock); } - btrfs_cleanup_one_transaction(t, fs_info); + btrfs_cleanup_one_transaction(t); spin_lock(&fs_info->trans_lock); if (t == fs_info->running_transaction) diff --git a/fs/btrfs/disk-io.h 
b/fs/btrfs/disk-io.h index 99af64d3f277..a7051e2570c1 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,8 +52,7 @@ struct extent_buffer *btrfs_find_create_tree_block( int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb); -int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - const char *options); +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices); void __cold close_ctree(struct btrfs_fs_info *fs_info); int btrfs_validate_super(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb, int mirror_num); @@ -127,8 +126,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); -void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info); +void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d9f511babd89..412e318e4a22 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -182,7 +182,7 @@ search_again: delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); @@ -795,7 +795,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, if (insert) { extra_size = btrfs_extent_inline_ref_size(want); path->search_for_extension = 1; - path->keep_locks = 1; } else extra_size = -1; @@ -946,6 +945,25 @@ again: ret = -EAGAIN; goto out; } + + if 
(path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) { + struct btrfs_key tmp_key; + + btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1); + if (tmp_key.objectid == bytenr && + tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = -EAGAIN; + goto out; + } + goto out_no_entry; + } + + if (!path->keep_locks) { + btrfs_release_path(path); + path->keep_locks = 1; + goto again; + } + /* * To add new inline back ref, we have to make sure * there is no corresponding back ref item. @@ -959,13 +977,15 @@ again: goto out; } } +out_no_entry: *ref_ret = (struct btrfs_extent_inline_ref *)ptr; out: - if (insert) { + if (path->keep_locks) { path->keep_locks = 0; - path->search_for_extension = 0; btrfs_unlock_up_safe(path, 1); } + if (insert) + path->search_for_extension = 0; return ret; } @@ -1807,16 +1827,6 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) return ref; } -static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) -{ - spin_lock(&delayed_refs->lock); - head->processing = false; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(head); -} - static struct btrfs_delayed_extent_op *cleanup_extent_op( struct btrfs_delayed_ref_head *head) { @@ -1891,7 +1901,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, ret = run_and_cleanup_extent_op(trans, head); if (ret < 0) { - unselect_delayed_ref_head(delayed_refs, head); + btrfs_unselect_ref_head(delayed_refs, head); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); return ret; } else if (ret) { @@ -1910,7 +1920,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); return 1; } - btrfs_delete_ref_head(delayed_refs, head); + btrfs_delete_ref_head(fs_info, delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -1933,39 +1943,6 @@ static int cleanup_ref_head(struct btrfs_trans_handle 
*trans, return ret; } -static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( - struct btrfs_trans_handle *trans) -{ - struct btrfs_delayed_ref_root *delayed_refs = - &trans->transaction->delayed_refs; - struct btrfs_delayed_ref_head *head = NULL; - int ret; - - spin_lock(&delayed_refs->lock); - head = btrfs_select_ref_head(delayed_refs); - if (!head) { - spin_unlock(&delayed_refs->lock); - return head; - } - - /* - * Grab the lock that says we are going to process all the refs for - * this head - */ - ret = btrfs_delayed_ref_lock(delayed_refs, head); - spin_unlock(&delayed_refs->lock); - - /* - * We may have dropped the spin lock to get the head mutex lock, and - * that might have given someone else time to free the head. If that's - * true, it has been removed from our list and we can move on. - */ - if (ret == -EAGAIN) - head = ERR_PTR(-EAGAIN); - - return head; -} - static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *locked_ref, u64 *bytes_released) @@ -1986,7 +1963,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); - unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_unselect_ref_head(delayed_refs, locked_ref); return -EAGAIN; } @@ -2009,7 +1986,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, default: WARN_ON(1); } - atomic_dec(&delayed_refs->num_entries); /* * Record the must_insert_reserved flag before we drop the @@ -2035,7 +2011,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { - unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_unselect_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); return ret; } @@ -2073,7 +2049,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, do { if (!locked_ref) { 
- locked_ref = btrfs_obtain_ref_head(trans); + locked_ref = btrfs_select_ref_head(fs_info, delayed_refs); if (IS_ERR_OR_NULL(locked_ref)) { if (PTR_ERR(locked_ref) == -EAGAIN) { continue; @@ -2220,7 +2196,7 @@ again: btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); - if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) { + if (xa_empty(&delayed_refs->head_refs)) { spin_unlock(&delayed_refs->lock); return 0; } @@ -2275,7 +2251,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, delayed_refs = &cur_trans->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) { spin_unlock(&delayed_refs->lock); btrfs_put_transaction(cur_trans); @@ -3144,7 +3120,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, break; } - /* Quick path didn't find the EXTEMT/METADATA_ITEM */ + /* Quick path didn't find the EXTENT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; @@ -3377,13 +3353,14 @@ out: static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, u64 bytenr) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (!head) goto out_delayed_unlock; @@ -3401,7 +3378,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!mutex_trylock(&head->mutex)) goto out; - btrfs_delete_ref_head(delayed_refs, head); + btrfs_delete_ref_head(fs_info, delayed_refs, head); head->processing = false; spin_unlock(&head->lock); @@ -3411,7 +3388,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if 
(head->must_insert_reserved) ret = 1; - btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; @@ -5270,7 +5247,7 @@ struct walk_control { * corrupted file systems must have been caught before calling this function. */ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, - struct extent_buffer *eb, u64 refs, u64 flags, int slot) + struct extent_buffer *eb, u64 flags, int slot) { struct btrfs_key key; u64 generation; @@ -5384,7 +5361,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; /* If we don't need to visit this node don't reada. */ - if (!visit_node_for_delete(root, wc, eb, refs, flags, slot)) + if (!visit_node_for_delete(root, wc, eb, flags, slot)) continue; reada: btrfs_readahead_node_child(eb, slot); @@ -5518,7 +5495,7 @@ again: */ delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) goto out; if (!mutex_trylock(&head->mutex)) { @@ -5737,8 +5714,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, /* If we don't have to walk into this node skip it. 
*/ if (!visit_node_for_delete(root, wc, path->nodes[level], - wc->refs[level - 1], wc->flags[level - 1], - path->slots[level])) + wc->flags[level - 1], path->slots[level])) goto skip; /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 309a8ae48434..b923d0cec61c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -190,7 +190,7 @@ static void process_one_folio(struct btrfs_fs_info *fs_info, btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) - btrfs_folio_end_writer_lock(fs_info, folio, start, len); + btrfs_folio_end_lock(fs_info, folio, start, len); } static void __process_folios_contig(struct address_space *mapping, @@ -262,22 +262,23 @@ static noinline int lock_delalloc_folios(struct inode *inode, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - u32 len = end + 1 - start; + u64 range_start; + u32 range_len; if (folio == locked_folio) continue; - if (btrfs_folio_start_writer_lock(fs_info, folio, start, - len)) - goto out; - + folio_lock(folio); if (!folio_test_dirty(folio) || folio->mapping != mapping) { - btrfs_folio_end_writer_lock(fs_info, folio, start, - len); + folio_unlock(folio); goto out; } + range_start = max_t(u64, folio_pos(folio), start); + range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + end + 1) - range_start; + btrfs_folio_set_lock(fs_info, folio, range_start, range_len); - processed_end = folio_pos(folio) + folio_size(folio) - 1; + processed_end = range_start + range_len - 1; } folio_batch_release(&fbatch); cond_resched(); @@ -437,7 +438,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le if (!btrfs_is_subpage(fs_info, folio->mapping)) folio_unlock(folio); else - btrfs_subpage_end_reader(fs_info, folio, start, len); + btrfs_folio_end_lock(fs_info, folio, start, len); } /* @@ -494,7 +495,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) return; 
ASSERT(folio_test_private(folio)); - btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); } /* @@ -785,7 +786,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, } if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page, + wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); size -= len; @@ -1101,6 +1102,45 @@ int btrfs_read_folio(struct file *file, struct folio *folio) return ret; } +static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap, + u64 start, u32 len) +{ + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + unsigned int start_bit; + unsigned int nbits; + + ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + nbits = len >> fs_info->sectorsize_bits; + ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); + bitmap_set(delalloc_bitmap, start_bit, nbits); +} + +static bool find_next_delalloc_bitmap(struct folio *folio, + unsigned long *delalloc_bitmap, u64 start, + u64 *found_start, u32 *found_len) +{ + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + const unsigned int bitmap_size = fs_info->sectors_per_page; + unsigned int start_bit; + unsigned int first_zero; + unsigned int first_set; + + ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); + if (first_set >= bitmap_size) + return false; + + *found_start = folio_start + (first_set << fs_info->sectorsize_bits); + first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set); + *found_len = (first_zero - first_set) << fs_info->sectorsize_bits; + return true; +} + /* * helper for 
extent_writepage(), doing all of the delayed allocation setup. * @@ -1120,6 +1160,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; + unsigned long delalloc_bitmap = 0; /* * Save the last found delalloc end. As the delalloc end can go beyond * page boundary, thus we cannot rely on subpage bitmap to locate the @@ -1130,6 +1171,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 delalloc_end = page_end; u64 delalloc_to_write = 0; int ret = 0; + int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { @@ -1139,6 +1181,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, bio_ctrl->submit_bitmap = 1; } + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + u64 start = page_start + (bit << fs_info->sectorsize_bits); + + btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); + } + /* Lock all (subpage) delalloc ranges inside the folio first. 
*/ while (delalloc_start < page_end) { delalloc_end = page_end; @@ -1147,9 +1195,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, delalloc_start = delalloc_end + 1; continue; } - btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start, - min(delalloc_end, page_end) + 1 - - delalloc_start); + set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, + min(delalloc_end, page_end) + 1 - delalloc_start); last_delalloc_end = delalloc_end; delalloc_start = delalloc_end + 1; } @@ -1174,7 +1221,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, found_len = last_delalloc_end + 1 - found_start; found = true; } else { - found = btrfs_subpage_find_writer_locked(fs_info, folio, + found = find_next_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, &found_start, &found_len); } if (!found) @@ -1313,7 +1360,7 @@ static int submit_one_sector(struct btrfs_inode *inode, * a folio for a range already written to disk. */ btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); - btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1); + btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); /* * Above call should set the whole folio with writeback flag, even * just for a single subpage sector. @@ -1390,8 +1437,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, goto out; submitted_io = true; } - - btrfs_folio_assert_not_dirty(fs_info, folio, start, len); out: /* * If we didn't submitted any sector (>= i_size), folio dirty get @@ -1475,7 +1520,7 @@ done: * Only unlock ranges that are submitted. As there can be some async * submitted ranges inside the folio. 
*/ - btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); + btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); ASSERT(ret <= 0); return ret; } @@ -1707,7 +1752,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, ret = bio_add_folio(&bbio->bio, folio, eb->len, eb->start - folio_pos(folio)); ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len); + wbc_account_cgroup_owner(wbc, folio, eb->len); folio_unlock(folio); } else { int num_folios = num_extent_folios(eb); @@ -1721,8 +1766,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_start_writeback(folio); ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio_page(folio, 0), - eb->folio_size); + wbc_account_cgroup_owner(wbc, folio, eb->folio_size); wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } @@ -2115,7 +2159,27 @@ retry: continue; } - if (wbc->sync_mode != WB_SYNC_NONE) { + /* + * For subpage case, compression can lead to mixed + * writeback and dirty flags, e.g: + * 0 32K 64K 96K 128K + * | |//////||/////| |//| + * + * In above case, [32K, 96K) is asynchronously submitted + * for compression, and [124K, 128K) needs to be written back. + * + * If we didn't wait wrtiteback for page 64K, [128K, 128K) + * won't be submitted as the page still has writeback flag + * and will be skipped in the next check. + * + * This mixed writeback and dirty case is only possible for + * subpage case. + * + * TODO: Remove this check after migrating compression to + * regular submission. 
+ */ + if (wbc->sync_mode != WB_SYNC_NONE || + btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); folio_wait_writeback(folio); @@ -2200,7 +2264,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f u32 cur_len = cur_end + 1 - cur; struct folio *folio; - folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0); + folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); /* * This shouldn't happen, the pages are pinned and locked, this @@ -2233,7 +2297,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f cur, cur_len, !ret); mapping_set_error(mapping, ret); } - btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len); + btrfs_folio_end_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; next_page: @@ -2317,7 +2381,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * to drop the page. */ static bool try_release_extent_state(struct extent_io_tree *tree, - struct folio *folio, gfp_t mask) + struct folio *folio) { u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; @@ -2428,7 +2492,7 @@ next: cond_resched(); } } - return try_release_extent_state(io_tree, folio, mask); + return try_release_extent_state(io_tree, folio); } static void __free_extent_buffer(struct extent_buffer *eb) @@ -2442,7 +2506,7 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } -static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio) +static bool folio_range_has_eb(struct folio *folio) { struct btrfs_subpage *subpage; @@ -2452,12 +2516,6 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli subpage = folio_get_private(folio); if (atomic_read(&subpage->eb_refs)) return true; - /* - * Even there is no eb refs here, we may still have - * end_folio_read() call relying on page::private. 
- */ - if (atomic_read(&subpage->readers)) - return true; } return false; } @@ -2516,7 +2574,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * We can only detach the folio private if there are no other ebs in the * page range and no unfinished IO. */ - if (!folio_range_has_eb(fs_info, folio)) + if (!folio_range_has_eb(folio)) btrfs_detach_subpage(fs_info, folio); spin_unlock(&folio->mapping->i_private_lock); @@ -3121,7 +3179,7 @@ out: } /* * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, - * so it can be cleaned up without utlizing page->mapping. + * so it can be cleaned up without utilizing page->mapping. */ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); @@ -4221,7 +4279,6 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level) { struct btrfs_tree_parent_check check = { - .has_first_key = 0, .level = level, .transid = gen }; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 25d191f1ac10..67ce85ff0ae2 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -77,10 +77,13 @@ static u64 range_end(u64 start, u64 len) return start + len; } -static void dec_evictable_extent_maps(struct btrfs_inode *inode) +static void remove_em(struct btrfs_inode *inode, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + rb_erase(&em->rb_node, &inode->extent_tree.root); + RB_CLEAR_NODE(&em->rb_node); + if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) percpu_counter_dec(&fs_info->evictable_extent_maps); } @@ -230,7 +233,12 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma if (extent_map_end(prev) != next->start) return false; - if (prev->flags != next->flags) + /* + * The merged flag is not an on-disk flag, it just indicates we had the + * extent maps of 2 (or more) adjacent extents merged, so factor it out. 
+ */ + if ((prev->flags & ~EXTENT_FLAG_MERGED) != + (next->flags & ~EXTENT_FLAG_MERGED)) return false; if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1) @@ -243,13 +251,19 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma /* * Handle the on-disk data extents merge for @prev and @next. * + * @prev: left extent to merge + * @next: right extent to merge + * @merged: the extent we will not discard after the merge; updated with new values + * + * After this, one of the two extents is the new merged extent and the other is + * removed from the tree and likely freed. Note that @merged is one of @prev/@next + * so there is const/non-const aliasing occurring here. + * * Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes. * For now only uncompressed regular extent can be merged. - * - * @prev and @next will be both updated to point to the new merged range. - * Thus one of them should be removed by the caller. */ -static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *next) +static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next, + struct extent_map *merged) { u64 new_disk_bytenr; u64 new_disk_num_bytes; @@ -284,15 +298,10 @@ static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *nex new_disk_bytenr; new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr; - prev->disk_bytenr = new_disk_bytenr; - prev->disk_num_bytes = new_disk_num_bytes; - prev->ram_bytes = new_disk_num_bytes; - prev->offset = new_offset; - - next->disk_bytenr = new_disk_bytenr; - next->disk_num_bytes = new_disk_num_bytes; - next->ram_bytes = new_disk_num_bytes; - next->offset = new_offset; + merged->disk_bytenr = new_disk_bytenr; + merged->disk_num_bytes = new_disk_num_bytes; + merged->ram_bytes = new_disk_num_bytes; + merged->offset = new_offset; } static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix, @@ -333,7 +342,6 @@ static void 
validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_map_tree *tree = &inode->extent_tree; struct extent_map *merge = NULL; struct rb_node *rb; @@ -361,14 +369,12 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) em->generation = max(em->generation, merge->generation); if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) - merge_ondisk_extents(merge, em); + merge_ondisk_extents(merge, em, em); em->flags |= EXTENT_FLAG_MERGED; validate_extent_map(fs_info, em); - rb_erase(&merge->rb_node, &tree->root); - RB_CLEAR_NODE(&merge->rb_node); + remove_em(inode, merge); free_extent_map(merge); - dec_evictable_extent_maps(inode); } } @@ -378,14 +384,12 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { em->len += merge->len; if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) - merge_ondisk_extents(em, merge); + merge_ondisk_extents(em, merge, em); validate_extent_map(fs_info, em); - rb_erase(&merge->rb_node, &tree->root); - RB_CLEAR_NODE(&merge->rb_node); em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; + remove_em(inode, merge); free_extent_map(merge); - dec_evictable_extent_maps(inode); } } @@ -582,12 +586,10 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) lockdep_assert_held_write(&tree->lock); WARN_ON(em->flags & EXTENT_FLAG_PINNED); - rb_erase(&em->rb_node, &tree->root); if (!(em->flags & EXTENT_FLAG_LOGGING)) list_del_init(&em->list); - RB_CLEAR_NODE(&em->rb_node); - dec_evictable_extent_maps(inode); + remove_em(inode, em); } static void replace_extent_mapping(struct btrfs_inode *inode, @@ -1116,13 +1118,12 @@ out_free_pre: struct btrfs_em_shrink_ctx { long nr_to_scan; long scanned; - u64 last_ino; - u64 last_root; }; static 
long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) { - const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info); struct extent_map_tree *tree = &inode->extent_tree; long nr_dropped = 0; struct rb_node *node; @@ -1195,7 +1196,8 @@ next: * lock. This is to avoid slowing other tasks trying to take the * lock. */ - if (need_resched() || rwlock_needbreak(&tree->lock)) + if (need_resched() || rwlock_needbreak(&tree->lock) || + btrfs_fs_closing(fs_info)) break; node = next; } @@ -1207,19 +1209,21 @@ next: static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; long nr_dropped = 0; - u64 min_ino = ctx->last_ino + 1; + u64 min_ino = fs_info->em_shrinker_last_ino + 1; inode = btrfs_find_first_inode(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); min_ino = btrfs_ino(inode) + 1; - ctx->last_ino = btrfs_ino(inode); + fs_info->em_shrinker_last_ino = btrfs_ino(inode); btrfs_add_delayed_iput(inode); - if (ctx->scanned >= ctx->nr_to_scan) + if (ctx->scanned >= ctx->nr_to_scan || + btrfs_fs_closing(inode->root->fs_info)) break; cond_resched(); @@ -1235,52 +1239,43 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx * inode if there is one or we will find out this was the last * one and move to the next root. */ - ctx->last_root = btrfs_root_id(root); + fs_info->em_shrinker_last_root = btrfs_root_id(root); } else { /* * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so * that when processing the next root we start from its first inode. 
*/ - ctx->last_ino = 0; - ctx->last_root = btrfs_root_id(root) + 1; + fs_info->em_shrinker_last_ino = 0; + fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1; } return nr_dropped; } -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +static void btrfs_extent_map_shrinker_worker(struct work_struct *work) { + struct btrfs_fs_info *fs_info; struct btrfs_em_shrink_ctx ctx; u64 start_root_id; u64 next_root_id; bool cycled = false; long nr_dropped = 0; - ctx.scanned = 0; - ctx.nr_to_scan = nr_to_scan; + fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work); - /* - * In case we have multiple tasks running this shrinker, make the next - * one start from the next inode in case it starts before we finish. - */ - spin_lock(&fs_info->extent_map_shrinker_lock); - ctx.last_ino = fs_info->extent_map_shrinker_last_ino; - fs_info->extent_map_shrinker_last_ino++; - ctx.last_root = fs_info->extent_map_shrinker_last_root; - spin_unlock(&fs_info->extent_map_shrinker_lock); + ctx.scanned = 0; + ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan); - start_root_id = ctx.last_root; - next_root_id = ctx.last_root; + start_root_id = fs_info->em_shrinker_last_root; + next_root_id = fs_info->em_shrinker_last_root; if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, - nr, ctx.last_root, - ctx.last_ino); + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr); } - while (ctx.scanned < ctx.nr_to_scan) { + while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { struct btrfs_root *root; unsigned long count; @@ -1294,8 +1289,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) spin_unlock(&fs_info->fs_roots_radix_lock); if (start_root_id > 0 && !cycled) { next_root_id = 0; - ctx.last_root = 0; - ctx.last_ino = 0; + fs_info->em_shrinker_last_root = 0; + 
fs_info->em_shrinker_last_ino = 0; cycled = true; continue; } @@ -1314,29 +1309,40 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) btrfs_put_root(root); } - /* - * In case of multiple tasks running this extent map shrinking code this - * isn't perfect but it's simple and silences things like KCSAN. It's - * not possible to know which task made more progress because we can - * cycle back to the first root and first inode if it's not the first - * time the shrinker ran, see the above logic. Also a task that started - * later may finish ealier than another task and made less progress. So - * make this simple and update to the progress of the last task that - * finished, with the occasional possiblity of having two consecutive - * runs of the shrinker process the same inodes. - */ - spin_lock(&fs_info->extent_map_shrinker_lock); - fs_info->extent_map_shrinker_last_ino = ctx.last_ino; - fs_info->extent_map_shrinker_last_root = ctx.last_root; - spin_unlock(&fs_info->extent_map_shrinker_lock); - if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, - nr, ctx.last_root, - ctx.last_ino); + trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); } - return nr_dropped; + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); +} + +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +{ + /* + * Do nothing if the shrinker is already running. In case of high memory + * pressure we can have a lot of tasks calling us and all passing the + * same nr_to_scan value, but in reality we may need only to free + * nr_to_scan extent maps (or less). In case we need to free more than + * that, we will be called again by the fs shrinker, so no worries about + * not doing enough work to reclaim memory from extent maps. 
+ * We can also be repeatedly called with the same nr_to_scan value + * simply because the shrinker runs asynchronously and multiple calls + * to this function are made before the shrinker does enough progress. + * + * That's why we set the atomic counter to nr_to_scan only if its + * current value is zero, instead of incrementing the counter by + * nr_to_scan. + */ + if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0) + return; + + queue_work(system_unbound_wq, &fs_info->em_shrinker_work); +} + +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) +{ + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); + INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker); } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 5154a8f1d26c..cd123b266b64 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map *new_em, bool modified); -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index df7f09f3b02e..b80c07ad8c5e 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -186,7 +186,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * we have in the cache is the last delalloc range we * found while the file extent item we found can be * either for a whole delalloc range we previously - * emmitted or only a part of that range. + * emitted or only a part of that range. * * We have two cases here: * @@ -194,13 +194,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * cached extent's end. 
In this case just ignore the * current file extent item because we don't want to * overlap with previous ranges that may have been - * emmitted already; + * emitted already; * * 2) The file extent item starts behind the currently * cached extent but its end offset goes beyond the * end offset of the cached extent. We don't want to * overlap with a previous range that may have been - * emmitted already, so we emit the currently cached + * emitted already, so we emit the currently cached * extent and then partially store the current file * extent item's range in the cache, for the subrange * going the cached extent's end to the end of the diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4fb521d91b06..588c353d2969 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -37,33 +37,30 @@ #include "file.h" #include "super.h" -/* simple helper to fault in pages and copy. This should go away - * and be replaced with calls into generic code. +/* + * Helper to fault in page and copy. This should go away and be replaced with + * calls into generic code. */ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, - struct page **prepared_pages, - struct iov_iter *i) + struct folio *folio, struct iov_iter *i) { size_t copied = 0; size_t total_copied = 0; - int pg = 0; int offset = offset_in_page(pos); while (write_bytes > 0) { - size_t count = min_t(size_t, - PAGE_SIZE - offset, write_bytes); - struct page *page = prepared_pages[pg]; + size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes); /* * Copy data from userspace to the current page */ - copied = copy_page_from_iter_atomic(page, offset, count, i); + copied = copy_folio_from_iter_atomic(folio, offset, count, i); /* Flush processor's dcache for this page */ - flush_dcache_page(page); + flush_dcache_folio(folio); /* * if we get a partial write, we can end up with - * partially up to date pages. These add + * partially up to date page. 
These add * a lot of complexity, so make sure they don't * happen by forcing this copy to be retried. * @@ -71,7 +68,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, * back to page at a time copies after we return 0. */ if (unlikely(copied < count)) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { iov_iter_revert(i, copied); copied = 0; } @@ -82,54 +79,44 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, write_bytes -= copied; total_copied += copied; offset += copied; - if (offset == PAGE_SIZE) { - pg++; - offset = 0; - } } return total_copied; } /* - * unlocks pages after btrfs_file_write is done with them + * Unlock folio after btrfs_file_write() is done with it. */ -static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, - struct page **pages, size_t num_pages, +static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio, u64 pos, u64 copied) { - size_t i; u64 block_start = round_down(pos, fs_info->sectorsize); u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; ASSERT(block_len <= U32_MAX); - for (i = 0; i < num_pages; i++) { - /* page checked is some magic around finding pages that - * have been modified without going through btrfs_set_page_dirty - * clear it here. There should be no need to mark the pages - * accessed as prepare_pages should have marked them accessed - * in prepare_pages via find_or_create_page() - */ - btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]), - block_start, block_len); - unlock_page(pages[i]); - put_page(pages[i]); - } + /* + * Folio checked is some magic around finding folios that have been + * modified without going through btrfs_dirty_folio(). Clear it here. 
+ * There should be no need to mark the pages accessed as + * prepare_one_folio() should have marked them accessed in + * prepare_one_folio() via find_or_create_page() + */ + btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len); + folio_unlock(folio); + folio_put(folio); } /* * After btrfs_copy_from_user(), update the following things for delalloc: - * - Mark newly dirtied pages as DELALLOC in the io tree. + * - Mark newly dirtied folio as DELALLOC in the io tree. * Used to advise which range is to be written back. - * - Mark modified pages as Uptodate/Dirty and not needing COW fixup + * - Mark modified folio as Uptodate/Dirty and not needing COW fixup * - Update inode size for past EOF write */ -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve) +int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, + size_t write_bytes, struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; - int i; u64 num_bytes; u64 start_pos; u64 end_of_last_block; @@ -147,6 +134,8 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); + ASSERT(folio_pos(folio) <= pos && + folio_pos(folio) + folio_size(folio) >= pos + write_bytes); end_of_last_block = start_pos + num_bytes - 1; @@ -163,16 +152,9 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, if (ret) return ret; - for (i = 0; i < num_pages; i++) { - struct page *p = pages[i]; - - btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p), - start_pos, num_bytes); - btrfs_folio_clamp_clear_checked(fs_info, page_folio(p), - start_pos, num_bytes); - btrfs_folio_clamp_set_dirty(fs_info, page_folio(p), - start_pos, num_bytes); - } + btrfs_folio_clamp_set_uptodate(fs_info, folio, 
start_pos, num_bytes); + btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes); + btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes); /* * we've only changed i_size in ram, and we haven't updated @@ -851,53 +833,47 @@ out: } /* - * on error we return an unlocked page and the error value - * on success we return a locked page and 0 + * On error return an unlocked folio and the error value + * On success return a locked folio and 0 */ -static int prepare_uptodate_page(struct inode *inode, - struct page *page, u64 pos, - bool force_uptodate) +static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos, + u64 len, bool force_uptodate) { - struct folio *folio = page_folio(page); + u64 clamp_start = max_t(u64, pos, folio_pos(folio)); + u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); int ret = 0; - if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && - !PageUptodate(page)) { - ret = btrfs_read_folio(NULL, folio); - if (ret) - return ret; - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - return -EIO; - } - - /* - * Since btrfs_read_folio() will unlock the folio before it - * returns, there is a window where btrfs_release_folio() can be - * called to release the page. Here we check both inode - * mapping and PagePrivate() to make sure the page was not - * released. - * - * The private flag check is essential for subpage as we need - * to store extra bitmap using folio private. 
- */ - if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { - unlock_page(page); - return -EAGAIN; - } - } - return 0; -} + if (folio_test_uptodate(folio)) + return 0; -static fgf_t get_prepare_fgp_flags(bool nowait) -{ - fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; + if (!force_uptodate && + IS_ALIGNED(clamp_start, PAGE_SIZE) && + IS_ALIGNED(clamp_end, PAGE_SIZE)) + return 0; - if (nowait) - fgp_flags |= FGP_NOWAIT; + ret = btrfs_read_folio(NULL, folio); + if (ret) + return ret; + folio_lock(folio); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + return -EIO; + } - return fgp_flags; + /* + * Since btrfs_read_folio() will unlock the folio before it returns, + * there is a window where btrfs_release_folio() can be called to + * release the page. Here we check both inode mapping and page + * private to make sure the page was not released. + * + * The private flag check is essential for subpage as we need to store + * extra bitmap using folio private. + */ + if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) { + folio_unlock(folio); + return -EAGAIN; + } + return 0; } static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) @@ -914,89 +890,67 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) } /* - * this just gets pages into the page cache and locks them down. + * Get folio into the page cache and lock it. */ -static noinline int prepare_pages(struct inode *inode, struct page **pages, - size_t num_pages, loff_t pos, - size_t write_bytes, bool force_uptodate, - bool nowait) +static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret, + loff_t pos, size_t write_bytes, + bool force_uptodate, bool nowait) { - int i; unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); - fgf_t fgp_flags = get_prepare_fgp_flags(nowait); + fgf_t fgp_flags = (nowait ? 
FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN); + struct folio *folio; int ret = 0; - int faili; - for (i = 0; i < num_pages; i++) { again: - pages[i] = pagecache_get_page(inode->i_mapping, index + i, - fgp_flags, mask | __GFP_WRITE); - if (!pages[i]) { - faili = i - 1; - if (nowait) - ret = -EAGAIN; - else - ret = -ENOMEM; - goto fail; - } - - ret = set_page_extent_mapped(pages[i]); - if (ret < 0) { - faili = i; - goto fail; - } - - if (i == 0) - ret = prepare_uptodate_page(inode, pages[i], pos, - force_uptodate); - if (!ret && i == num_pages - 1) - ret = prepare_uptodate_page(inode, pages[i], - pos + write_bytes, false); - if (ret) { - put_page(pages[i]); - if (!nowait && ret == -EAGAIN) { - ret = 0; - goto again; - } - faili = i - 1; - goto fail; + folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); + if (IS_ERR(folio)) { + if (nowait) + ret = -EAGAIN; + else + ret = PTR_ERR(folio); + return ret; + } + /* Only support page sized folio yet. */ + ASSERT(folio_order(folio) == 0); + ret = set_folio_extent_mapped(folio); + if (ret < 0) { + folio_unlock(folio); + folio_put(folio); + return ret; + } + ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate); + if (ret) { + /* The folio is already unlocked. */ + folio_put(folio); + if (!nowait && ret == -EAGAIN) { + ret = 0; + goto again; } - wait_on_page_writeback(pages[i]); + return ret; } - + *folio_ret = folio; return 0; -fail: - while (faili >= 0) { - unlock_page(pages[faili]); - put_page(pages[faili]); - faili--; - } - return ret; - } /* - * This function locks the extent and properly waits for data=ordered extents - * to finish before allowing the pages to be modified if need. + * Locks the extent and properly waits for data=ordered extents to finish + * before allowing the folios to be modified if need. 
* - * The return value: + * Return: * 1 - the extent is locked * 0 - the extent is not locked, and everything is OK - * -EAGAIN - need re-prepare the pages - * the other < 0 number - Something wrong happens + * -EAGAIN - need to prepare the folios again */ static noinline int -lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, - size_t write_bytes, +lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, + loff_t pos, size_t write_bytes, u64 *lockstart, u64 *lockend, bool nowait, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 start_pos; u64 last_pos; - int i; int ret = 0; start_pos = round_down(pos, fs_info->sectorsize); @@ -1008,12 +962,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, if (nowait) { if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, cached_state)) { - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); - pages[i] = NULL; - } - + folio_unlock(folio); + folio_put(folio); return -EAGAIN; } } else { @@ -1027,10 +977,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, ordered->file_offset <= last_pos) { unlock_extent(&inode->io_tree, start_pos, last_pos, cached_state); - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); - } + folio_unlock(folio); + folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; @@ -1044,11 +992,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } /* - * We should be called after prepare_pages() which should have locked + * We should be called after prepare_one_folio() which should have locked * all pages in the range. 
*/ - for (i = 0; i < num_pages; i++) - WARN_ON(!PageLocked(pages[i])); + WARN_ON(!folio_test_locked(folio)); return ret; } @@ -1120,27 +1067,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) btrfs_drew_write_unlock(&inode->root->snapshot_lock); } -static void update_time_for_write(struct inode *inode) -{ - struct timespec64 now, ts; - - if (IS_NOCMTIME(inode)) - return; - - now = current_time(inode); - ts = inode_get_mtime(inode); - if (!timespec64_equal(&ts, &now)) - inode_set_mtime_to_ts(inode, now); - - ts = inode_get_ctime(inode); - if (!timespec64_equal(&ts, &now)) - inode_set_ctime_to_ts(inode, now); - - if (IS_I_VERSION(inode)) - inode_inc_iversion(inode); -} - -int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) +int btrfs_write_check(struct kiocb *iocb, size_t count) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -1170,7 +1097,10 @@ int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) * need to start yet another transaction to update the inode as we will * update the inode when we finish writing whatever data we write. */ - update_time_for_write(inode); + if (!IS_NOCMTIME(inode)) { + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + inode_inc_iversion(inode); + } start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); @@ -1192,20 +1122,17 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) loff_t pos; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct page **pages = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; u64 lockend; size_t num_written = 0; - int nrptrs; ssize_t ret; - bool only_release_metadata = false; - bool force_page_uptodate = false; loff_t old_isize = i_size_read(inode); unsigned int ilock_flags = 0; const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); unsigned int bdp_flags = (nowait ? 
BDP_ASYNC : 0); + bool only_release_metadata = false; if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; @@ -1218,38 +1145,26 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) if (ret <= 0) goto out; - ret = btrfs_write_check(iocb, i, ret); + ret = btrfs_write_check(iocb, ret); if (ret < 0) goto out; pos = iocb->ki_pos; - nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), - PAGE_SIZE / (sizeof(struct page *))); - nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); - nrptrs = max(nrptrs, 8); - pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto out; - } - while (iov_iter_count(i) > 0) { struct extent_state *cached_state = NULL; size_t offset = offset_in_page(pos); size_t sector_offset; - size_t write_bytes = min(iov_iter_count(i), - nrptrs * (size_t)PAGE_SIZE - - offset); - size_t num_pages; + size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset); size_t reserve_bytes; - size_t dirty_pages; size_t copied; size_t dirty_sectors; size_t num_sectors; + struct folio *folio = NULL; int extents_locked; + bool force_page_uptodate = false; /* - * Fault pages before locking them in prepare_pages + * Fault pages before locking them in prepare_one_folio() * to avoid recursive lock */ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { @@ -1288,8 +1203,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) only_release_metadata = true; } - num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); - WARN_ON(num_pages > nrptrs); reserve_bytes = round_up(write_bytes + sector_offset, fs_info->sectorsize); WARN_ON(reserve_bytes == 0); @@ -1317,23 +1230,17 @@ again: break; } - /* - * This is going to setup the pages array with the number of - * pages we want, so we don't really need to worry about the - * contents of pages from loop to loop - */ - ret = prepare_pages(inode, pages, num_pages, - pos, write_bytes, force_page_uptodate, false); + ret = 
prepare_one_folio(inode, &folio, pos, write_bytes, + force_page_uptodate, false); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); break; } - extents_locked = lock_and_cleanup_extent_if_need( - BTRFS_I(inode), pages, - num_pages, pos, write_bytes, &lockstart, - &lockend, nowait, &cached_state); + extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode), + folio, pos, write_bytes, &lockstart, + &lockend, nowait, &cached_state); if (extents_locked < 0) { if (!nowait && extents_locked == -EAGAIN) goto again; @@ -1344,28 +1251,18 @@ again: break; } - copied = btrfs_copy_from_user(pos, write_bytes, pages, i); + copied = btrfs_copy_from_user(pos, write_bytes, folio, i); num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); dirty_sectors = round_up(copied + sector_offset, fs_info->sectorsize); dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); - /* - * if we have trouble faulting in the pages, fall - * back to one page at a time - */ - if (copied < write_bytes) - nrptrs = 1; - if (copied == 0) { force_page_uptodate = true; dirty_sectors = 0; - dirty_pages = 0; } else { force_page_uptodate = false; - dirty_pages = DIV_ROUND_UP(copied + offset, - PAGE_SIZE); } if (num_sectors > dirty_sectors) { @@ -1375,13 +1272,10 @@ again: btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); } else { - u64 __pos; - - __pos = round_down(pos, - fs_info->sectorsize) + - (dirty_pages << PAGE_SHIFT); + u64 release_start = round_up(pos + copied, + fs_info->sectorsize); btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, __pos, + data_reserved, release_start, release_bytes, true); } } @@ -1389,15 +1283,14 @@ again: release_bytes = round_up(copied + sector_offset, fs_info->sectorsize); - ret = btrfs_dirty_pages(BTRFS_I(inode), pages, - dirty_pages, pos, copied, + ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied, &cached_state, only_release_metadata); /* * If we have not locked the extent range, because the range's 
* start offset is >= i_size, we might still have a non-NULL * cached extent state, acquired while marking the extent range - * as delalloc through btrfs_dirty_pages(). Therefore free any + * as delalloc through btrfs_dirty_page(). Therefore free any * possible cached extent state to avoid a memory leak. */ if (extents_locked) @@ -1408,7 +1301,7 @@ again: btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { - btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); + btrfs_drop_folio(fs_info, folio, pos, copied); break; } @@ -1416,7 +1309,7 @@ again: if (only_release_metadata) btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); + btrfs_drop_folio(fs_info, folio, pos, copied); cond_resched(); @@ -1424,8 +1317,6 @@ again: num_written += copied; } - kfree(pages); - if (release_bytes) { if (only_release_metadata) { btrfs_check_nocow_unlock(BTRFS_I(inode)); @@ -1470,7 +1361,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (ret || encoded->len == 0) goto out; - ret = btrfs_write_check(iocb, from, encoded->len); + ret = btrfs_write_check(iocb, encoded->len); if (ret < 0) goto out; @@ -3802,6 +3693,7 @@ const struct file_operations btrfs_file_operations = { .compat_ioctl = btrfs_compat_ioctl, #endif .remap_file_range = btrfs_remap_file_range, + .uring_cmd = btrfs_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, }; diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index 912254e653cf..de89e644be29 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -34,9 +34,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); int btrfs_release_file(struct inode *inode, struct file *file); -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state 
**cached, bool noreserve); +int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, + size_t write_bytes, struct extent_state **cached, bool noreserve); int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes, bool nowait); @@ -44,7 +43,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode); bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret); -int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count); +int btrfs_write_check(struct kiocb *iocb, size_t count); ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i); #endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f4bcb2530660..cfa52ef40b06 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,6 +11,7 @@ #include <linux/ratelimit.h> #include <linux/error-injection.h> #include <linux/sched/mm.h> +#include <linux/string_choices.h> #include "ctree.h" #include "fs.h" #include "messages.h" @@ -1387,6 +1388,7 @@ static int __btrfs_write_out_cache(struct inode *inode, int bitmaps = 0; int ret; int must_iput = 0; + int i_size; if (!i_size_read(inode)) return -EIO; @@ -1457,11 +1459,16 @@ static int __btrfs_write_out_cache(struct inode *inode, io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. 
*/ - ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, - io_ctl->num_pages, 0, i_size_read(inode), - &cached_state, false); - if (ret) - goto out_nospc; + i_size = i_size_read(inode); + for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) { + u64 dirty_start = i * PAGE_SIZE; + u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start; + + ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]), + dirty_start, dirty_len, &cached_state, false); + if (ret < 0) + goto out_nospc; + } if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); @@ -2936,12 +2943,11 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, if (info->bytes >= bytes && !block_group->ro) count++; btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", - info->offset, info->bytes, - (info->bitmap) ? "yes" : "no"); + info->offset, info->bytes, str_yes_no(info->bitmap)); } spin_unlock(&ctl->tree_lock); btrfs_info(fs_info, "block group has cluster?: %s", - list_empty(&block_group->cluster_list) ? "no" : "yes"); + str_no_yes(list_empty(&block_group->cluster_list))); btrfs_info(fs_info, "%d free space entries at or bigger than %llu bytes", count, bytes); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 79f64e383edd..79a1a3d6f04d 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -263,10 +263,10 @@ enum { BTRFS_FEATURE_INCOMPAT_ZONED | \ BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* * Features under developmen like Extent tree v2 support is enabled - * only under CONFIG_BTRFS_DEBUG. 
+ * only under CONFIG_BTRFS_EXPERIMENTAL */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ @@ -317,6 +317,8 @@ struct btrfs_dev_replace { struct percpu_counter bio_counter; wait_queue_head_t replace_wait; + + struct task_struct *replace_task; }; /* @@ -633,9 +635,10 @@ struct btrfs_fs_info { s32 delalloc_batch; struct percpu_counter evictable_extent_maps; - spinlock_t extent_map_shrinker_lock; - u64 extent_map_shrinker_last_root; - u64 extent_map_shrinker_last_ino; + u64 em_shrinker_last_root; + u64 em_shrinker_last_ino; + atomic64_t em_shrinker_nr_to_scan; + struct work_struct em_shrinker_work; /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; @@ -876,12 +879,9 @@ struct btrfs_fs_info { #endif }; -#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \ - struct page *: (_page))->mapping->host)) #define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \ struct folio *: (_folio))->mapping->host)) -#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info) #define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5618ca02934a..03fe0de2cd0d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -421,7 +421,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, index++; continue; } - folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); index++; if (IS_ERR(folio)) continue; @@ -556,8 +556,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, } else { struct folio *folio; - folio = __filemap_get_folio(inode->vfs_inode.i_mapping, - 0, 0, 0); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0); ASSERT(!IS_ERR(folio)); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_local_folio(folio, 0); @@ -646,7 +645,7 @@ static bool 
can_cow_file_range_inline(struct btrfs_inode *inode, * If being used directly, you must have already checked we're allowed to cow * the range by getting true from can_cow_file_range_inline(). */ -static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset, +static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 size, size_t compressed_size, int compress_type, struct folio *compressed_folio, @@ -736,7 +735,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, return 1; lock_extent(&inode->io_tree, offset, end, &cached); - ret = __cow_file_range_inline(inode, offset, size, compressed_size, + ret = __cow_file_range_inline(inode, size, compressed_size, compress_type, compressed_folio, update_i_size); if (ret > 0) { @@ -832,32 +831,16 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, return 0; } /* - * Special check for subpage. + * Only enable sector perfect compression for experimental builds. * - * We lock the full page then run each delalloc range in the page, thus - * for the following case, we will hit some subpage specific corner case: + * This is a big feature change for subpage cases, and can hit + * different corner cases, so only limit this feature for + * experimental build for now. * - * 0 32K 64K - * | |///////| |///////| - * \- A \- B - * - * In above case, both range A and range B will try to unlock the full - * page [0, 64K), causing the one finished later will have page - * unlocked already, triggering various page lock requirement BUG_ON()s. - * - * So here we add an artificial limit that subpage compression can only - * if the range is fully page aligned. - * - * In theory we only need to ensure the first page is fully covered, but - * the tailing partial page will be locked until the full compression - * finishes, delaying the write of other range. 
- * - * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range - * first to prevent any submitted async extent to unlock the full page. - * By this, we can ensure for subpage case that only the last async_cow - * will unlock the full page. + * ETA for moving this out of experimental builds is 6.15. */ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->sectorsize < PAGE_SIZE && + !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(end + 1)) return 0; @@ -896,13 +879,14 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e for (unsigned long index = start >> PAGE_SHIFT; index <= end_index; index++) { - folio = __filemap_get_folio(inode->i_mapping, index, 0, 0); + folio = filemap_get_folio(inode->i_mapping, index); if (IS_ERR(folio)) { if (!ret) ret = PTR_ERR(folio); continue; } - folio_clear_dirty_for_io(folio); + btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + end + 1 - start); folio_put(folio); } return ret; @@ -1001,17 +985,6 @@ again: (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; - /* - * For subpage case, we require full page alignment for the sector - * aligned range. - * Thus we must also check against @actual_end, not just @end. 
- */ - if (blocksize < PAGE_SIZE) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(round_up(actual_end, blocksize))) - goto cleanup_and_bail_uncompressed; - } - total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; @@ -1359,7 +1332,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; - unsigned long ram_size; u64 cur_alloc_size = 0; u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; @@ -1367,7 +1339,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct extent_map *em; unsigned clear_bits; unsigned long page_ops; - bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(inode)) { @@ -1421,8 +1392,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; - cur_alloc_size = num_bytes; - ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, + ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret == -EAGAIN) { @@ -1453,9 +1423,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; - extent_reserved = true; - ram_size = ins.offset; file_extent.disk_bytenr = ins.objectid; file_extent.disk_num_bytes = ins.offset; file_extent.num_bytes = ins.offset; @@ -1463,14 +1431,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode, file_extent.offset = 0; file_extent.compression = BTRFS_COMPRESS_NONE; - lock_extent(&inode->io_tree, start, start + ram_size - 1, + lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, &cached); em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { unlock_extent(&inode->io_tree, start, - start + ram_size - 1, &cached); + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } @@ -1480,7 +1448,7 @@ 
static noinline int cow_file_range(struct btrfs_inode *inode, 1 << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { unlock_extent(&inode->io_tree, start, - start + ram_size - 1, &cached); + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1501,7 +1469,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, */ if (ret) btrfs_drop_extent_map_range(inode, start, - start + ram_size - 1, + start + cur_alloc_size - 1, false); } btrfs_put_ordered_extent(ordered); @@ -1513,13 +1481,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * (which the caller expects to stay locked), don't clear any * dirty bits and don't set any writeback bits * - * Do set the Ordered (Private2) bit so we know this page was + * Do set the Ordered flag so we know this page was * properly setup for writepage. */ page_ops = (keep_locked ? 0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; - extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, + extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); @@ -1529,7 +1497,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; - extent_reserved = false; + cur_alloc_size = 0; /* * btrfs_reloc_clone_csums() error, since start is increased @@ -1545,7 +1513,7 @@ done: return ret; out_drop_extent_cache: - btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); + btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); @@ -1599,13 +1567,12 @@ out_unlock: * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. 
*/ - if (extent_reserved) { + if (cur_alloc_size) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_folio, &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); - start += cur_alloc_size; } /* @@ -1614,11 +1581,13 @@ out_unlock: * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ - if (start < end) { + if (start + cur_alloc_size < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, locked_folio, + extent_clear_unlock_delalloc(inode, start + cur_alloc_size, + end, locked_folio, &cached, clear_bits, page_ops); - btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); + btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, + end - start - cur_alloc_size + 1, NULL); } return ret; } @@ -1729,7 +1698,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * need full accuracy. Just account the whole thing * against the first page. 
*/ - wbc_account_cgroup_owner(wbc, &locked_folio->page, + wbc_account_cgroup_owner(wbc, locked_folio, cur_end - start); async_chunk[i].locked_folio = locked_folio; locked_folio = NULL; @@ -3094,34 +3063,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - - btrfs_inode_safe_disk_i_size_write(inode, 0); - if (freespace_inode) - trans = btrfs_join_transaction_spacecache(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - trans->block_rsv = &inode->block_rsv; - ret = btrfs_update_inode_fallback(trans, inode); - if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, ret); - - ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) - btrfs_abort_transaction(trans, ret); - - goto out; - } - - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); - if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -3135,8 +3076,31 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + /* Logic error */ + ASSERT(list_empty(&ordered_extent->list)); + if (!list_empty(&ordered_extent->list)) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + + btrfs_inode_safe_disk_i_size_write(inode, 0); + ret = btrfs_update_inode_fallback(trans, inode); + if (ret) { + /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, ret); + } goto out; + } + + clear_bits |= EXTENT_LOCKED; + lock_extent(io_tree, start, end, &cached_state); if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 
compress_type = ordered_extent->compress_type; @@ -3791,14 +3755,45 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) return 0; } +static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) +{ + struct btrfs_root *root = inode->root; + struct btrfs_inode *existing; + const u64 ino = btrfs_ino(inode); + int ret; + + if (inode_unhashed(&inode->vfs_inode)) + return 0; + + if (prealloc) { + ret = xa_reserve(&root->inodes, ino, GFP_NOFS); + if (ret) + return ret; + } + + existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); + + if (xa_is_err(existing)) { + ret = xa_err(existing); + ASSERT(ret != -EINVAL); + ASSERT(ret != -ENOMEM); + return ret; + } else if (existing) { + WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); + } + + return 0; +} + /* - * read an inode from the btree into the in-memory inode + * Read a locked inode from the btree into the in-memory inode and add it to + * its root list/tree. + * + * On failure clean up the inode. 
*/ -static int btrfs_read_locked_inode(struct inode *inode, - struct btrfs_path *in_path) +static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3812,25 +3807,25 @@ static int btrfs_read_locked_inode(struct inode *inode, ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); if (ret) - return ret; + goto out; ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; - if (!path) { - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - } + ASSERT(path); btrfs_get_inode_key(BTRFS_I(inode), &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { - if (path != in_path) - btrfs_free_path(path); - return ret; + /* + * ret > 0 can come from btrfs_search_slot called by + * btrfs_lookup_inode(), this means the inode was not found. 
+ */ + if (ret > 0) + ret = -ENOENT; + goto out; } leaf = path->nodes[0]; @@ -3965,8 +3960,6 @@ cache_acl: btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); } - if (path != in_path) - btrfs_free_path(path); if (!maybe_acls) cache_no_acl(inode); @@ -3993,7 +3986,15 @@ cache_acl: } btrfs_sync_inode_flags_to_i_flags(inode); + + ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + if (ret) + goto out; + return 0; +out: + iget_failed(inode); + return ret; } /* @@ -4368,11 +4369,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); - if (IS_ERR_OR_NULL(di)) { - if (!di) - ret = -ENOENT; - else - ret = PTR_ERR(di); + if (IS_ERR(di)) { + ret = PTR_ERR(di); btrfs_abort_transaction(trans, ret); goto out; } @@ -5505,35 +5503,7 @@ out: return err; } -static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) -{ - struct btrfs_root *root = inode->root; - struct btrfs_inode *existing; - const u64 ino = btrfs_ino(inode); - int ret; - if (inode_unhashed(&inode->vfs_inode)) - return 0; - - if (prealloc) { - ret = xa_reserve(&root->inodes, ino, GFP_NOFS); - if (ret) - return ret; - } - - existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); - - if (xa_is_err(existing)) { - ret = xa_err(existing); - ASSERT(ret != -EINVAL); - ASSERT(ret != -ENOMEM); - return ret; - } else if (existing) { - WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); - } - - return 0; -} static void btrfs_del_inode_from_root(struct btrfs_inode *inode) { @@ -5595,10 +5565,8 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) } /* - * Get an inode object given its inode number and corresponding root. - * Path can be preallocated to prevent recursing back to iget through - * allocator. NULL is also valid but may require an additional allocation - * later. 
+ * Get an inode object given its inode number and corresponding root. Path is + * preallocated to prevent recursing back to iget through allocator. */ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct btrfs_path *path) @@ -5614,30 +5582,40 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, return inode; ret = btrfs_read_locked_inode(inode, path); - /* - * ret > 0 can come from btrfs_search_slot called by - * btrfs_read_locked_inode(), this means the inode item was not found. - */ - if (ret > 0) - ret = -ENOENT; - if (ret < 0) - goto error; - - ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); - if (ret < 0) - goto error; + if (ret) + return ERR_PTR(ret); unlock_new_inode(inode); - return inode; -error: - iget_failed(inode); - return ERR_PTR(ret); } +/* + * Get an inode object given its inode number and corresponding root. + */ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - return btrfs_iget_path(ino, root, NULL); + struct inode *inode; + struct btrfs_path *path; + int ret; + + inode = btrfs_iget_locked(ino, root); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) + return inode; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_read_locked_inode(inode, path); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + unlock_new_inode(inode); + return inode; } static struct inode *new_simple_dir(struct inode *dir, @@ -6026,7 +6004,7 @@ again: * offset. This means that new entries created during readdir * are *guaranteed* to be seen in the future by that readdir. * This has broken buggy programs which operate on names as - * they're returned by readdir. Until we re-use freed offsets + * they're returned by readdir. Until we reuse freed offsets * we have this hack to stop new entries from being returned * under the assumption that they'll never reach this huge * offset. 
@@ -6768,8 +6746,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, return ret; } -static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, - struct folio *folio) +static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { struct btrfs_file_extent_item *fi; void *kaddr; @@ -6967,7 +6944,7 @@ next: ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, folio); + ret = read_inline_extent(path, folio); if (ret < 0) goto out; goto insert; @@ -7297,7 +7274,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * * But already submitted bio can still be finished on this folio. * Furthermore, endio function won't skip folio which has Ordered - * (Private2) already cleared, so it's possible for endio and + * already cleared, so it's possible for endio and * invalidate_folio to do the same ordered extent accounting twice * on one folio. * @@ -7363,7 +7340,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, range_len = range_end + 1 - cur; if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) { /* - * If Ordered (Private2) is cleared, it means endio has + * If Ordered is cleared, it means endio has * already been executed for the range. * We can't delete the extent states as * btrfs_finish_ordered_io() may still use some of them. @@ -7436,7 +7413,7 @@ next: } /* * We have iterated through all ordered extents of the page, the page - * should not have Ordered (Private2) anymore, or the above iteration + * should not have Ordered anymore, or the above iteration * did something wrong. 
*/ ASSERT(!folio_test_ordered(folio)); @@ -8975,28 +8952,6 @@ out_inode: return finish_open_simple(file, ret); } -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct folio *folio; - u32 len; - - ASSERT(end + 1 - start <= U32_MAX); - len = end + 1 - start; - while (index <= end_index) { - folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); - ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */ - - /* This is for data, which doesn't yet support larger folio. */ - ASSERT(folio_order(folio) == 0); - btrfs_folio_set_writeback(fs_info, folio, start, len); - folio_put(folio); - index++; - } -} - int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type) { @@ -9041,12 +8996,16 @@ static ssize_t btrfs_encoded_read_inline( unsigned long ptr; void *tmp; ssize_t ret; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } + + path->nowait = nowait; + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), extent_start, 0); if (ret) { @@ -9110,6 +9069,7 @@ out: struct btrfs_encoded_read_private { wait_queue_head_t wait; + void *uring_ctx; atomic_t pending; blk_status_t status; }; @@ -9129,26 +9089,40 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (!atomic_dec_return(&priv->pending)) - wake_up(&priv->wait); + if (atomic_dec_return(&priv->pending) == 0) { + int err = blk_status_to_errno(READ_ONCE(priv->status)); + + if (priv->uring_ctx) { + btrfs_uring_read_extent_endio(priv->uring_ctx, err); + kfree(priv); + } else { + wake_up(&priv->wait); + } + } bio_put(&bbio->bio); } int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 
disk_io_size, struct page **pages) + u64 disk_bytenr, u64 disk_io_size, + struct page **pages, void *uring_ctx) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private priv = { - .pending = ATOMIC_INIT(1), - }; + struct btrfs_encoded_read_private *priv; unsigned long i = 0; struct btrfs_bio *bbio; + int ret; + + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; - init_waitqueue_head(&priv.wait); + init_waitqueue_head(&priv->wait); + atomic_set(&priv->pending, 1); + priv->status = 0; + priv->uring_ctx = uring_ctx; bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; @@ -9156,11 +9130,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv.pending); + atomic_inc(&priv->pending); btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; continue; @@ -9171,22 +9145,33 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv.pending); + atomic_inc(&priv->pending); btrfs_submit_bbio(bbio, 0); - if (atomic_dec_return(&priv.pending)) - io_wait_event(priv.wait, !atomic_read(&priv.pending)); - /* See btrfs_encoded_read_endio() for ordering. 
*/ - return blk_status_to_errno(READ_ONCE(priv.status)); + if (uring_ctx) { + if (atomic_dec_return(&priv->pending) == 0) { + ret = blk_status_to_errno(READ_ONCE(priv->status)); + btrfs_uring_read_extent_endio(uring_ctx, ret); + kfree(priv); + return ret; + } + + return -EIOCBQUEUED; + } else { + if (atomic_dec_return(&priv->pending) != 0) + io_wait_event(priv->wait, !atomic_read(&priv->pending)); + /* See btrfs_encoded_read_endio() for ordering. */ + ret = blk_status_to_errno(READ_ONCE(priv->status)); + kfree(priv); + return ret; + } } -static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, - struct iov_iter *iter, - u64 start, u64 lockend, - struct extent_state **cached_state, - u64 disk_bytenr, u64 disk_io_size, - size_t count, bool compressed, - bool *unlocked) +ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, bool *unlocked) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; @@ -9206,8 +9191,8 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, goto out; } - ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, - disk_io_size, pages); + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, + disk_io_size, pages, NULL); if (ret) goto out; @@ -9247,21 +9232,26 @@ out: } ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded) + struct btrfs_ioctl_encoded_io_args *encoded, + struct extent_state **cached_state, + u64 *disk_bytenr, u64 *disk_io_size) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; ssize_t ret; size_t count = iov_iter_count(iter); - u64 start, lockend, disk_bytenr, disk_io_size; - struct extent_state 
*cached_state = NULL; + u64 start, lockend; struct extent_map *em; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); bool unlocked = false; file_accessed(iocb->ki_filp); - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + ret = btrfs_inode_lock(inode, + BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0)); + if (ret) + return ret; if (iocb->ki_pos >= inode->vfs_inode.i_size) { btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -9274,21 +9264,46 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, */ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; - for (;;) { + if (nowait) { struct btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(inode, start, - lockend - start + 1); - if (ret) + if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping, + start, lockend)) { + ret = -EAGAIN; + goto out_unlock_inode; + } + + if (!try_lock_extent(io_tree, start, lockend, cached_state)) { + ret = -EAGAIN; goto out_unlock_inode; - lock_extent(io_tree, start, lockend, &cached_state); + } + ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); - if (!ordered) - break; - btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, &cached_state); - cond_resched(); + if (ordered) { + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, lockend, cached_state); + ret = -EAGAIN; + goto out_unlock_inode; + } + } else { + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(inode, start, + lockend - start + 1); + if (ret) + goto out_unlock_inode; + + lock_extent(io_tree, start, lockend, cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, + lockend - start + 1); + if (!ordered) + break; + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, lockend, cached_state); + cond_resched(); + } } em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); @@ -9307,9 +9322,9 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, 
free_extent_map(em); em = NULL; ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, - &cached_state, extent_start, + cached_state, extent_start, count, encoded, &unlocked); - goto out; + goto out_unlock_extent; } /* @@ -9320,12 +9335,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, inode->vfs_inode.i_size) - iocb->ki_pos; if (em->disk_bytenr == EXTENT_MAP_HOLE || (em->flags & EXTENT_FLAG_PREALLOC)) { - disk_bytenr = EXTENT_MAP_HOLE; + *disk_bytenr = EXTENT_MAP_HOLE; count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; } else if (extent_map_is_compressed(em)) { - disk_bytenr = em->disk_bytenr; + *disk_bytenr = em->disk_bytenr; /* * Bail if the buffer isn't large enough to return the whole * compressed extent. @@ -9334,7 +9349,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ret = -ENOBUFS; goto out_em; } - disk_io_size = em->disk_num_bytes; + *disk_io_size = em->disk_num_bytes; count = em->disk_num_bytes; encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); @@ -9344,47 +9359,42 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, goto out_em; encoded->compression = ret; } else { - disk_bytenr = extent_map_block_start(em) + (start - em->start); + *disk_bytenr = extent_map_block_start(em) + (start - em->start); if (encoded->len > count) encoded->len = count; /* * Don't read beyond what we locked. This also limits the page * allocations that we'll do. 
*/ - disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; - count = start + disk_io_size - iocb->ki_pos; + *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; + count = start + *disk_io_size - iocb->ki_pos; encoded->len = count; encoded->unencoded_len = count; - disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); + *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize); } free_extent_map(em); em = NULL; - if (disk_bytenr == EXTENT_MAP_HOLE) { - unlock_extent(io_tree, start, lockend, &cached_state); + if (*disk_bytenr == EXTENT_MAP_HOLE) { + unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); if (ret != count) ret = -EFAULT; } else { - ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, - &cached_state, disk_bytenr, - disk_io_size, count, - encoded->compression, - &unlocked); + ret = -EIOCBQUEUED; + goto out_unlock_extent; } -out: - if (ret >= 0) - iocb->ki_pos += encoded->len; out_em: free_extent_map(em); out_unlock_extent: - if (!unlocked) - unlock_extent(io_tree, start, lockend, &cached_state); + /* Leave inode and extent locked if we need to do a read. 
*/ + if (!unlocked && ret != -EIOCBQUEUED) + unlock_extent(io_tree, start, lockend, cached_state); out_unlock_inode: - if (!unlocked) + if (!unlocked && ret != -EIOCBQUEUED) btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } @@ -9495,7 +9505,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, */ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); - folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT); if (!folios) return -ENOMEM; for (i = 0; i < nr_folios; i++) { @@ -9559,7 +9569,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (encoded->unencoded_len == encoded->len && encoded->unencoded_offset == 0 && can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { - ret = __cow_file_range_inline(inode, start, encoded->len, + ret = __cow_file_range_inline(inode, encoded->len, orig_count, compression, folios[0], true); if (ret <= 0) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 226c91fe31a7..c9302d193187 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -29,6 +29,7 @@ #include <linux/fileattr.h> #include <linux/fsverity.h> #include <linux/sched/xacct.h> +#include <linux/io_uring/cmd.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -1048,7 +1049,6 @@ static noinline int btrfs_mksnapshot(const struct path *parent, struct btrfs_qgroup_inherit *inherit) { int ret; - bool snapshot_force_cow = false; /* * Force new buffered writes to reserve space even when NOCOW is @@ -1067,15 +1067,13 @@ static noinline int btrfs_mksnapshot(const struct path *parent, * creation. 
*/ atomic_inc(&root->snapshot_force_cow); - snapshot_force_cow = true; btrfs_wait_ordered_extents(root, U64_MAX, NULL); ret = btrfs_mksubvol(parent, idmap, name, namelen, root, readonly, inherit); + atomic_dec(&root->snapshot_force_cow); out: - if (snapshot_force_cow) - atomic_dec(&root->snapshot_force_cow); btrfs_drew_read_unlock(&root->snapshot_lock); return ret; } @@ -1308,9 +1306,9 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, ret = btrfs_mksubvol(&file->f_path, idmap, name, namelen, NULL, readonly, inherit); } else { - struct fd src = fdget(fd); + CLASS(fd, src)(fd); struct inode *src_inode; - if (!fd_file(src)) { + if (fd_empty(src)) { ret = -EINVAL; goto out_drop_write; } @@ -1341,7 +1339,6 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, BTRFS_I(src_inode)->root, readonly, inherit); } - fdput(src); } out_drop_write: mnt_drop_write_file(file); @@ -4058,8 +4055,7 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, return 0; } -static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, - void __user *arg) +static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4514,12 +4510,17 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; + struct btrfs_inode *inode = BTRFS_I(file_inode(file)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; loff_t pos; struct kiocb kiocb; ssize_t ret; + u64 disk_bytenr, disk_io_size; + struct extent_state *cached_state = NULL; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4572,7 +4573,32 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos; - ret = 
btrfs_encoded_read(&kiocb, &iter, &args); + ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + &disk_bytenr, &disk_io_size); + + if (ret == -EIOCBQUEUED) { + bool unlocked = false; + u64 start, lockend, count; + + start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize); + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + if (args.compression) + count = disk_io_size; + else + count = args.len; + + ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend, + &cached_state, disk_bytenr, + disk_io_size, count, + args.compression, &unlocked); + + if (!unlocked) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + } + } + if (ret >= 0) { fsnotify_access(file); if (copy_to_user(argp + copy_end, @@ -4690,6 +4716,439 @@ out_acct: return ret; } +/* + * Context that's attached to an encoded read io_uring command, in cmd->pdu. It + * contains the fields in btrfs_uring_read_extent that are necessary to finish + * off and cleanup the I/O in btrfs_uring_read_finished. 
+ */ +struct btrfs_uring_priv { + struct io_uring_cmd *cmd; + struct page **pages; + unsigned long nr_pages; + struct kiocb iocb; + struct iovec *iov; + struct iov_iter iter; + struct extent_state *cached_state; + u64 count; + u64 start; + u64 lockend; + int err; + bool compressed; +}; + +struct io_btrfs_cmd { + struct btrfs_uring_priv *priv; +}; + +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); + struct btrfs_uring_priv *priv = bc->priv; + struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + unsigned long index; + u64 cur; + size_t page_offset; + ssize_t ret; + + if (priv->err) { + ret = priv->err; + goto out; + } + + if (priv->compressed) { + index = 0; + page_offset = 0; + } else { + index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT; + page_offset = offset_in_page(priv->iocb.ki_pos - priv->start); + } + cur = 0; + while (cur < priv->count) { + size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset); + + if (copy_page_to_iter(priv->pages[index], page_offset, bytes, + &priv->iter) != bytes) { + ret = -EFAULT; + goto out; + } + + index++; + cur += bytes; + page_offset = 0; + } + ret = priv->count; + +out: + unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); + add_rchar(current, ret); + + for (index = 0; index < priv->nr_pages; index++) + __free_page(priv->pages[index]); + + kfree(priv->pages); + kfree(priv->iov); + kfree(priv); +} + +void btrfs_uring_read_extent_endio(void *ctx, int err) +{ + struct btrfs_uring_priv *priv = ctx; + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct io_btrfs_cmd); + + priv->err = err; + bc->priv = priv; + + io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished); +} + +static int 
btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state *cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, + struct iovec *iov, struct io_uring_cmd *cmd) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + struct page **pages; + struct btrfs_uring_priv *priv = NULL; + unsigned long nr_pages; + int ret; + + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + ret = btrfs_alloc_page_array(nr_pages, pages, 0); + if (ret) { + ret = -ENOMEM; + goto out_fail; + } + + priv = kmalloc(sizeof(*priv), GFP_NOFS); + if (!priv) { + ret = -ENOMEM; + goto out_fail; + } + + priv->iocb = *iocb; + priv->iov = iov; + priv->iter = *iter; + priv->count = count; + priv->cmd = cmd; + priv->cached_state = cached_state; + priv->compressed = compressed; + priv->nr_pages = nr_pages; + priv->pages = pages; + priv->start = start; + priv->lockend = lockend; + priv->err = 0; + + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, + disk_io_size, pages, priv); + if (ret && ret != -EIOCBQUEUED) + goto out_fail; + + /* + * If we return -EIOCBQUEUED, we're deferring the cleanup to + * btrfs_uring_read_finished(), which will handle unlocking the extent + * and inode and freeing the allocations. 
+ */ + + return -EIOCBQUEUED; + +out_fail: + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + kfree(priv); + return ret; +} + +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); + size_t copy_end; + struct btrfs_ioctl_encoded_io_args args = { 0 }; + int ret; + u64 disk_bytenr, disk_io_size; + struct file *file; + struct btrfs_inode *inode; + struct btrfs_fs_info *fs_info; + struct extent_io_tree *io_tree; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + struct extent_state *cached_state = NULL; + u64 start, lockend; + void __user *sqe_addr; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + file = cmd->file; + inode = BTRFS_I(file->f_inode); + fs_info = inode->root->fs_info; + io_tree = &inode->io_tree; + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); + if (copy_from_user(&args32, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; +#else + return -ENOTTY; +#endif + } else { + copy_end = copy_end_kernel; + if (copy_from_user(&args, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + + if (args.flags != 0) + return -EINVAL; + + ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_free; + } + + pos = args.offset; + ret = rw_verify_area(READ, file, &pos, args.len); + if (ret < 0) + goto 
out_free; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + + if (issue_flags & IO_URING_F_NONBLOCK) + kiocb.ki_flags |= IOCB_NOWAIT; + + start = ALIGN_DOWN(pos, fs_info->sectorsize); + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + &disk_bytenr, &disk_io_size); + if (ret < 0 && ret != -EIOCBQUEUED) + goto out_free; + + file_accessed(file); + + if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel, + sizeof(args) - copy_end_kernel)) { + if (ret == -EIOCBQUEUED) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + } + ret = -EFAULT; + goto out_free; + } + + if (ret == -EIOCBQUEUED) { + u64 count; + + /* + * If we've optimized things by storing the iovecs on the stack, + * undo this. + */ + if (!iov) { + iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS); + if (!iov) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + ret = -ENOMEM; + goto out_acct; + } + + memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt); + } + + count = min_t(u64, iov_iter_count(&iter), disk_io_size); + + /* Match ioctl by not returning past EOF if uncompressed. 
*/ + if (!args.compression) + count = min_t(u64, count, args.len); + + ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, + cached_state, disk_bytenr, + disk_io_size, count, + args.compression, iov, cmd); + + goto out_acct; + } + +out_free: + kfree(iov); + +out_acct: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + + return ret; +} + +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + switch (cmd->cmd_op) { + case BTRFS_IOC_ENCODED_READ: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_READ_32: +#endif + return btrfs_uring_encoded_read(cmd, issue_flags); + } + + return -EINVAL; +} + +static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp) +{ + struct btrfs_root *root; + struct btrfs_ioctl_subvol_wait args = { 0 }; + signed long sched_ret; + int refs; + u64 root_flags; + bool wait_for_deletion = false; + bool found = false; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + switch (args.mode) { + case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED: + /* + * Wait for the first one deleted that waits until all previous + * are cleaned. 
+ */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + root = list_last_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + found = true; + } + spin_unlock(&fs_info->trans_lock); + if (!found) + return -ENOENT; + + fallthrough; + case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE: + if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) || + BTRFS_LAST_FREE_OBJECTID < args.subvolid) + return -EINVAL; + break; + case BTRFS_SUBVOL_SYNC_COUNT: + spin_lock(&fs_info->trans_lock); + args.count = list_count_nodes(&fs_info->dead_roots); + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + case BTRFS_SUBVOL_SYNC_PEEK_FIRST: + spin_lock(&fs_info->trans_lock); + /* Last in the list was deleted first. */ + if (!list_empty(&fs_info->dead_roots)) { + root = list_last_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + } else { + args.subvolid = 0; + } + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + case BTRFS_SUBVOL_SYNC_PEEK_LAST: + spin_lock(&fs_info->trans_lock); + /* First in the list was deleted last. */ + if (!list_empty(&fs_info->dead_roots)) { + root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + } else { + args.subvolid = 0; + } + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + default: + return -EINVAL; + } + + /* 32bit limitation: fs_roots_radix key is not wide enough. */ + if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX) + return -EOVERFLOW; + + while (1) { + /* Wait for the specific one. 
*/ + if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR) + return -EINTR; + refs = -1; + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)args.subvolid); + if (root) { + spin_lock(&root->root_item_lock); + refs = btrfs_root_refs(&root->root_item); + root_flags = btrfs_root_flags(&root->root_item); + spin_unlock(&root->root_item_lock); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + up_read(&fs_info->subvol_sem); + + /* Subvolume does not exist. */ + if (!root) + return -ENOENT; + + /* Subvolume not deleted at all. */ + if (refs > 0) + return -EEXIST; + /* We've waited and now the subvolume is gone. */ + if (wait_for_deletion && refs == -1) { + /* Return the one we waited for as the last one. */ + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + } + + /* Subvolume not found on the first try (deleted or never existed). */ + if (refs == -1) + return -ENOENT; + + wait_for_deletion = true; + ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD); + sched_ret = schedule_timeout_interruptible(HZ); + /* Early wake up or error. 
*/ + if (sched_ret != 0) + return -EINTR; + } + + return 0; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -4812,7 +5271,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_QUOTA_RESCAN_STATUS: return btrfs_ioctl_quota_rescan_status(fs_info, argp); case BTRFS_IOC_QUOTA_RESCAN_WAIT: - return btrfs_ioctl_quota_rescan_wait(fs_info, argp); + return btrfs_ioctl_quota_rescan_wait(fs_info); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(fs_info, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: @@ -4841,6 +5300,8 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_ENCODED_WRITE_32: return btrfs_ioctl_encoded_write(file, argp, true); #endif + case BTRFS_IOC_SUBVOL_SYNC_WAIT: + return btrfs_ioctl_subvol_sync(fs_info, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 19cd26b0244a..2b760c8778f8 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -22,5 +22,7 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(const u8 *uuid); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +void btrfs_uring_read_extent_endio(void *ctx, int err); #endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 6a0b7abb5bd9..9a7a7b723305 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -162,21 +162,6 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) } /* - * Try-lock for write. - * - * Return 1 if the rwlock has been taken, 0 otherwise - */ -int btrfs_try_tree_write_lock(struct extent_buffer *eb) -{ - if (down_write_trylock(&eb->lock)) { - btrfs_set_eb_lock_owner(eb, current->pid); - trace_btrfs_try_tree_write_lock(eb); - return 1; - } - return 0; -} - -/* * Release read lock. 
*/ void btrfs_tree_read_unlock(struct extent_buffer *eb) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 3c15c75e0582..46c8be2afab1 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -180,7 +180,6 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb) void btrfs_tree_read_unlock(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); -int btrfs_try_tree_write_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 72856f6775f7..a45bc11f8665 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -80,7 +80,7 @@ void lzo_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *lzo_alloc_workspace(unsigned int level) +struct list_head *lzo_alloc_workspace(void) { struct workspace *workspace; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2104d60c2161..95c8499a159a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -346,10 +346,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio)); /* - * Ordered (Private2) bit indicates whether we still have + * Ordered flag indicates whether we still have * pending io unfinished for the ordered extent. * - * If there's no such bit, we need to skip to next range. + * If it's not set, we need to skip to next range. 
*/ if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len)) return false; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c297909f1506..a6f92836c9b1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -226,8 +226,7 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, return qgroup; } -static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup *qgroup) +static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) { struct btrfs_qgroup_list *list; @@ -258,7 +257,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) return -ENOENT; rb_erase(&qgroup->node, &fs_info->qgroup_tree); - __del_qgroup_rb(fs_info, qgroup); + __del_qgroup_rb(qgroup); return 0; } @@ -469,7 +468,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) /* * If a qgroup exists for a subvolume ID, it is possible * that subvolume has been deleted, in which case - * re-using that ID would lead to incorrect accounting. + * reusing that ID would lead to incorrect accounting. * * Ensure that we skip any such subvol ids. * @@ -643,7 +642,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); - __del_qgroup_rb(fs_info, qgroup); + __del_qgroup_rb(qgroup); btrfs_sysfs_del_one_qgroup(fs_info, qgroup); kfree(qgroup); } @@ -1407,7 +1406,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) fs_info->quota_root = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; - fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; + fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; spin_unlock(&fs_info->qgroup_lock); btrfs_free_qgroup_config(fs_info); @@ -2001,20 +2000,30 @@ out: * Return <0 for insertion failure, caller can free @record safely. 
*/ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record, + u64 bytenr) { struct btrfs_qgroup_extent_record *existing, *ret; - unsigned long bytenr = record->bytenr; + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); if (!btrfs_qgroup_full_accounting(fs_info)) return 1; - lockdep_assert_held(&delayed_refs->lock); - trace_btrfs_qgroup_trace_extent(fs_info, record); +#if BITS_PER_LONG == 32 + if (bytenr >= MAX_LFS_FILESIZE) { + btrfs_err_rl(fs_info, +"qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit", + bytenr); + btrfs_err_32bit_limit(fs_info); + return -EOVERFLOW; + } +#endif + + trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr); xa_lock(&delayed_refs->dirty_extents); - existing = xa_load(&delayed_refs->dirty_extents, bytenr); + existing = xa_load(&delayed_refs->dirty_extents, index); if (existing) { if (record->data_rsv && !existing->data_rsv) { existing->data_rsv = record->data_rsv; @@ -2024,7 +2033,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, return 1; } - ret = __xa_store(&delayed_refs->dirty_extents, record->bytenr, record, GFP_ATOMIC); + ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC); xa_unlock(&delayed_refs->dirty_extents); if (xa_is_err(ret)) { qgroup_mark_inconsistent(fs_info); @@ -2056,12 +2065,17 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, * transaction committing, but not now as qgroup accounting will be wrong again. 
*/ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, - struct btrfs_qgroup_extent_record *qrecord) + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr) { - struct btrfs_backref_walk_ctx ctx = { 0 }; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_backref_walk_ctx ctx = { + .bytenr = bytenr, + .fs_info = fs_info, + }; int ret; - if (!btrfs_qgroup_full_accounting(trans->fs_info)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* * We are always called in a context where we are already holding a @@ -2084,16 +2098,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, */ ASSERT(trans != NULL); - if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) + if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0; - ctx.bytenr = qrecord->bytenr; - ctx.fs_info = trans->fs_info; - ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { - qgroup_mark_inconsistent(trans->fs_info); - btrfs_warn(trans->fs_info, + qgroup_mark_inconsistent(fs_info); + btrfs_warn(fs_info, "error accounting new delayed refs extent (err code: %d), quota inconsistent", ret); return 0; @@ -2128,7 +2139,8 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; - struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); int ret; if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) @@ -2137,26 +2149,21 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!record) return -ENOMEM; - if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, bytenr, GFP_NOFS)) { + if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { kfree(record); return -ENOMEM; } - delayed_refs = 
&trans->transaction->delayed_refs; - record->bytenr = bytenr; record->num_bytes = num_bytes; - record->old_roots = NULL; - spin_lock(&delayed_refs->lock); - ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); - spin_unlock(&delayed_refs->lock); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr); if (ret) { /* Clean up if insertion fails or item exists. */ - xa_release(&delayed_refs->dirty_extents, record->bytenr); + xa_release(&delayed_refs->dirty_extents, index); kfree(record); return 0; } - return btrfs_qgroup_trace_extent_post(trans, record); + return btrfs_qgroup_trace_extent_post(trans, record, bytenr); } /* @@ -2641,7 +2648,6 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, if (!extent_buffer_uptodate(root_eb)) { struct btrfs_tree_parent_check check = { - .has_first_key = false, .transid = root_gen, .level = root_level }; @@ -3032,14 +3038,16 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; xa_for_each(&delayed_refs->dirty_extents, index, record) { + const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits); + num_dirty_extents++; - trace_btrfs_qgroup_account_extents(fs_info, record); + trace_btrfs_qgroup_account_extents(fs_info, record, bytenr); if (!ret && !(fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { struct btrfs_backref_walk_ctx ctx = { 0 }; - ctx.bytenr = record->bytenr; + ctx.bytenr = bytenr; ctx.fs_info = fs_info; /* @@ -3081,7 +3089,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) ulist_del(record->old_roots, qgroup_to_skip, 0); } - ret = btrfs_qgroup_account_extent(trans, record->bytenr, + ret = btrfs_qgroup_account_extent(trans, bytenr, record->num_bytes, record->old_roots, new_roots); @@ -4185,13 +4193,20 @@ static int try_flush_qgroup(struct btrfs_root *root) return 0; } - btrfs_run_delayed_iputs(root->fs_info); - 
btrfs_wait_on_delayed_iputs(root->fs_info); ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; btrfs_wait_ordered_extents(root, U64_MAX, NULL); + /* + * After waiting for ordered extents run delayed iputs in order to free + * space from unlinked files before committing the current transaction, + * as ordered extents may have been holding the last reference of an + * inode and they add a delayed iput when they complete. + */ + btrfs_run_delayed_iputs(root->fs_info); + btrfs_wait_on_delayed_iputs(root->fs_info); + ret = btrfs_commit_current_transaction(root); out: clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); @@ -4676,8 +4691,7 @@ out: * BOTH POINTERS ARE BEFORE TREE SWAP * @last_snapshot: last snapshot generation of the subvolume tree */ -int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *subvol_root, +int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, @@ -4883,17 +4897,6 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) xa_destroy(&trans->delayed_refs.dirty_extents); } -void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) -{ - if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) - return; - - if (!is_fstree(root)) - return; - - btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA); -} - int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, const struct btrfs_squota_delta *delta) { diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 98adf4ec7b01..e233cc79af18 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -121,11 +121,18 @@ struct btrfs_inode; #define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63) #define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62) +#define BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT (3) + /* * Record a dirty 
extent, and info qgroup to update quota on it */ struct btrfs_qgroup_extent_record { - u64 bytenr; + /* + * The bytenr of the extent is given by its index in the dirty_extents + * xarray of struct btrfs_delayed_ref_root left shifted by + * fs_info->sectorsize_bits. + */ + u64 num_bytes; /* @@ -343,9 +350,11 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); + struct btrfs_qgroup_extent_record *record, + u64 bytenr); int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, - struct btrfs_qgroup_extent_record *qrecord); + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr); int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, @@ -430,8 +439,7 @@ void btrfs_qgroup_init_swapped_blocks( struct btrfs_qgroup_swapped_blocks *swapped_blocks); void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root); -int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *subvol_root, +int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, @@ -440,7 +448,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info); -void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes); int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, const struct btrfs_squota_delta *delta); diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 4c859b550f6c..9ffc79f250fb 
100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -13,6 +13,39 @@ #include "volumes.h" #include "print-tree.h" +static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + const struct btrfs_key *oldkey, + u64 newlen, u64 frontpad) +{ + struct btrfs_stripe_extent *extent; + struct extent_buffer *leaf; + int slot; + size_t item_size; + struct btrfs_key newkey = { + .objectid = oldkey->objectid + frontpad, + .type = BTRFS_RAID_STRIPE_KEY, + .offset = newlen, + }; + + ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY); + + leaf = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(leaf, slot); + extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + + for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { + struct btrfs_raid_stride *stride = &extent->strides[i]; + u64 phys; + + phys = btrfs_raid_stride_physical(leaf, stride); + btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad); + } + + btrfs_set_item_key_safe(trans, path, &newkey); +} + int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -36,23 +69,24 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le while (1) { key.objectid = start; key.type = BTRFS_RAID_STRIPE_KEY; - key.offset = length; + key.offset = 0; ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); if (ret < 0) break; - if (ret > 0) { - ret = 0; - if (path->slots[0] == 0) - break; + + if (path->slots[0] == btrfs_header_nritems(path->nodes[0])) path->slots[0]--; - } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); found_start = key.objectid; found_end = found_start + key.offset; + ret = 0; + + if (key.type != BTRFS_RAID_STRIPE_KEY) + break; /* That stripe ends before we start, we're done. 
*/ if (found_end <= start) @@ -61,7 +95,40 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le trace_btrfs_raid_extent_delete(fs_info, start, end, found_start, found_end); - ASSERT(found_start >= start && found_end <= end); + /* + * The stripe extent starts before the range we want to delete: + * + * |--- RAID Stripe Extent ---| + * |--- keep ---|--- drop ---| + * + * This means we have to duplicate the tree item, truncate the + * length to the new size and then re-insert the item. + */ + if (found_start < start) { + u64 diff = start - found_start; + + btrfs_partially_delete_raid_extent(trans, path, &key, + diff, 0); + break; + } + + /* + * The stripe extent ends after the range we want to delete: + * + * |--- RAID Stripe Extent ---| + * |--- drop ---|--- keep ---| + * + * This means we have to duplicate the tree item, truncate the + * length to the new size and then re-insert the item. + */ + if (found_end > end) { + u64 diff = found_end - end; + + btrfs_partially_delete_raid_extent(trans, path, &key, + diff, diff); + break; + } + ret = btrfs_del_item(trans, stripe_root, path); if (ret) break; @@ -108,8 +175,9 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, return ret; } -static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, - struct btrfs_io_context *bioc) +EXPORT_FOR_TESTS +int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key stripe_key; @@ -233,7 +301,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, found_end = found_logical + found_length; if (found_logical > end) { - ret = -ENOENT; + ret = -ENODATA; goto out; } @@ -279,10 +347,10 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, } /* If we're here, we haven't found the requested devid in the stripe. 
*/ - ret = -ENOENT; + ret = -ENODATA; out: if (ret > 0) - ret = -ENOENT; + ret = -ENODATA; if (ret && ret != -EIO && !stripe->rst_search_commit_root) { btrfs_debug(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index 1ac1c21aac2f..541836421778 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -28,6 +28,11 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *ordered_extent); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc); +#endif + static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info, u64 map_type) { diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 39bec672df0c..cdd373c27784 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1272,8 +1272,7 @@ static inline void bio_list_put(struct bio_list *bio_list) static void assert_rbio(struct btrfs_raid_bio *rbio) { - if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) || - !IS_ENABLED(CONFIG_BTRFS_ASSERT)) + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f3834f8d26b4..bf267bdfa8f8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1244,7 +1244,7 @@ again: * The real subtree rescan is delayed until we have new * CoW on the subtree root node before transaction commit. 
*/ - ret = btrfs_qgroup_add_swapped_blocks(trans, dest, + ret = btrfs_qgroup_add_swapped_blocks(dest, rc->block_group, parent, slot, path->nodes[level], path->slots[level], last_snapshot); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3a3427428074..204c928beaf9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1656,8 +1656,7 @@ static u32 stripe_length(const struct scrub_stripe *stripe) stripe->bg->start + stripe->bg->length - stripe->logical); } -static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, - struct scrub_stripe *stripe) +static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; @@ -1704,8 +1703,18 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, &stripe_len, &bioc, &io_stripe, &mirror); btrfs_put_bioc(bioc); if (err < 0) { - set_bit(i, &stripe->io_error_bitmap); - set_bit(i, &stripe->error_bitmap); + if (err != -ENODATA) { + /* + * Earlier btrfs_get_raid_extent_offset() + * returned -ENODATA, which means there's + * no entry for the corresponding range + * in the stripe tree. But if it's in + * the extent tree, then it's a preallocated + * extent and not an error. + */ + set_bit(i, &stripe->io_error_bitmap); + set_bit(i, &stripe->error_bitmap); + } continue; } @@ -1743,7 +1752,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { - scrub_submit_extent_sector_read(sctx, stripe); + scrub_submit_extent_sector_read(stripe); return; } @@ -1954,7 +1963,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ASSERT(sctx->raid56_data_stripes); /* - * For data stripe search, we cannot re-use the same extent/csum paths, + * For data stripe search, we cannot reuse the same extent/csum paths, * as the data stripe bytenr may be smaller than previous extent. 
Thus * we have to use our own extent/csum paths. */ @@ -2103,7 +2112,6 @@ out: */ static int scrub_simple_mirror(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct btrfs_chunk_map *map, u64 logical_start, u64 logical_length, struct btrfs_device *device, u64 physical, int mirror_num) @@ -2222,7 +2230,7 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, * just RAID1, so we can reuse scrub_simple_mirror() to scrub * this stripe. */ - ret = scrub_simple_mirror(sctx, bg, map, cur_logical, + ret = scrub_simple_mirror(sctx, bg, cur_logical, BTRFS_STRIPE_LEN, device, cur_physical, mirror_num); if (ret) @@ -2256,7 +2264,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* Offset inside the chunk */ u64 offset; u64 stripe_logical; - int stop_loop = 0; /* Extent_path should be released by now. */ ASSERT(sctx->extent_path.nodes[0] == NULL); @@ -2307,7 +2314,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * Only @physical and @mirror_num needs to calculated using * @stripe_index. */ - ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length, + ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length, scrub_dev, map->stripes[stripe_index].physical, stripe_index + 1); offset = 0; @@ -2362,7 +2369,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * We can reuse scrub_simple_mirror() here, as the repair part * is still based on @mirror_num. 
*/ - ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN, + ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN, scrub_dev, physical, 1); if (ret < 0) goto out; @@ -2370,14 +2377,8 @@ next: logical += increment; physical += BTRFS_STRIPE_LEN; spin_lock(&sctx->stat_lock); - if (stop_loop) - sctx->stat.last_physical = - map->stripes[stripe_index].physical + dev_stripe_len; - else - sctx->stat.last_physical = physical; + sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); - if (stop_loop) - break; } out: ret2 = flush_scrub_stripes(sctx); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 27306d98ec43..7254279c3cc9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -980,9 +980,7 @@ static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) return ret; } -typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, - struct fs_path *p, - void *ctx); +typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx); /* * Helper function to iterate the entries in ONE btrfs_inode_ref or @@ -1007,8 +1005,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, u32 name_len; char *start; int ret = 0; - int num = 0; - int index; u64 dir; unsigned long name_off; unsigned long elem_size; @@ -1043,13 +1039,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, iref = (struct btrfs_inode_ref *)(ptr + cur); name_len = btrfs_inode_ref_name_len(eb, iref); name_off = (unsigned long)(iref + 1); - index = btrfs_inode_ref_index(eb, iref); dir = found_key->offset; } else { extref = (struct btrfs_inode_extref *)(ptr + cur); name_len = btrfs_inode_extref_name_len(eb, extref); name_off = (unsigned long)&extref->name; - index = btrfs_inode_extref_index(eb, extref); dir = btrfs_inode_extref_parent(eb, extref); } @@ -1094,10 +1088,9 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, } cur += elem_size + name_len; - ret = iterate(num, dir, 
index, p, ctx); + ret = iterate(dir, p, ctx); if (ret) goto out; - num++; } out: @@ -1227,8 +1220,7 @@ out: return ret; } -static int __copy_first_ref(int num, u64 dir, int index, - struct fs_path *p, void *ctx) +static int __copy_first_ref(u64 dir, struct fs_path *p, void *ctx) { int ret; struct fs_path *pt = ctx; @@ -3768,7 +3760,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, struct recorded_ref *parent_ref, const bool is_orphan) { - struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key di_key; @@ -3797,7 +3788,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, goto out; } - di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name, + di = btrfs_match_dir_item_name(path, parent_ref->name, parent_ref->name_len); if (!di) { ret = 0; @@ -4708,8 +4699,7 @@ out: return ret; } -static int record_new_ref_if_needed(int num, u64 dir, int index, - struct fs_path *name, void *ctx) +static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { int ret = 0; struct send_ctx *sctx = ctx; @@ -4738,8 +4728,7 @@ out: return ret; } -static int record_deleted_ref_if_needed(int num, u64 dir, int index, - struct fs_path *name, void *ctx) +static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { int ret = 0; struct send_ctx *sctx = ctx; @@ -5677,10 +5666,11 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * Note that send_buf is a mapping of send_buf_pages, so this is really * reading into send_buf. 
*/ - ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, + ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), disk_bytenr, disk_num_bytes, sctx->send_buf_pages + - (data_offset >> PAGE_SHIFT)); + (data_offset >> PAGE_SHIFT), + NULL); if (ret) goto out; @@ -7190,13 +7180,11 @@ static int changed_extent(struct send_ctx *sctx, static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { - int ret = 0; - if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result == BTRFS_COMPARE_TREE_NEW) sctx->cur_inode_needs_verity = true; } - return ret; + return 0; } static int dir_changed(struct send_ctx *sctx, u64 dir) @@ -8137,7 +8125,20 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a * making it RW. This also protects against deletion. */ spin_lock(&send_root->root_item_lock); - if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) { + /* + * Unlikely but possible, if the subvolume is marked for deletion but + * is slow to remove the directory entry, send can still be started. + */ + if (btrfs_root_dead(send_root)) { + spin_unlock(&send_root->root_item_lock); + return -EPERM; + } + /* Userspace tools do the checks and warn the user if it's not RO. */ + if (!btrfs_root_readonly(send_root)) { + spin_unlock(&send_root->root_item_lock); + return -EPERM; + } + if (send_root->dedupe_in_progress) { dedupe_in_progress_warn(send_root); spin_unlock(&send_root->root_item_lock); return -EAGAIN; @@ -8146,15 +8147,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a spin_unlock(&send_root->root_item_lock); /* - * Userspace tools do the checks and warn the user if it's - * not RO. - */ - if (!btrfs_root_readonly(send_root)) { - ret = -EPERM; - goto out; - } - - /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside * access_ok. 
Also set an upper limit for allocation size so this can't @@ -8219,15 +8211,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a } sctx->send_root = send_root; - /* - * Unlikely but possible, if the subvolume is marked for deletion but - * is slow to remove the directory entry, send can still be started - */ - if (btrfs_root_dead(sctx->send_root)) { - ret = -EPERM; - goto out; - } - sctx->clone_roots_cnt = arg->clone_sources_count; if (sctx->proto >= 2) { diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index b07f4aa66878..9309886c5ea1 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -16,7 +16,7 @@ struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" /* Conditional support for the upcoming protocol version. */ -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL #define BTRFS_SEND_STREAM_VERSION 3 #else #define BTRFS_SEND_STREAM_VERSION 2 diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d5a9cd8a4fd8..255e85f78313 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1279,7 +1279,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * If we are freeing inodes, we want to make sure all delayed iputs have * completed, because they could have been on an inode with i_nlink == 0, and * thus have been truncated and freed up space. But again this space is not - * immediately re-usable, it comes in the form of a delayed ref, which must be + * immediately reusable, it comes in the form of a delayed ref, which must be * run and then the transaction must be committed. 
* * COMMIT_TRANS @@ -1488,8 +1488,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } -static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static void wait_reserve_ticket(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { @@ -1547,7 +1546,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL_STEAL: - wait_reserve_ticket(fs_info, space_info, ticket); + wait_reserve_ticket(space_info, ticket); break; case BTRFS_RESERVE_FLUSH_LIMIT: priority_reclaim_metadata_space(fs_info, space_info, ticket, @@ -1984,8 +1983,7 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) return unalloc < data_chunk_size; } -static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, int raid) +static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; int thresh_pct; @@ -2081,6 +2079,6 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) if (!btrfs_should_periodic_reclaim(space_info)) continue; for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) - do_reclaim_sweep(fs_info, space_info, raid); + do_reclaim_sweep(space_info, raid); } } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index fe4d719d506b..8c68059ac1b0 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -140,12 +140,10 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); spin_lock_init(&ret->lock); - if (type == BTRFS_SUBPAGE_METADATA) { + if (type == BTRFS_SUBPAGE_METADATA) atomic_set(&ret->eb_refs, 0); - } else { - atomic_set(&ret->readers, 0); - atomic_set(&ret->writers, 0); - } + else + atomic_set(&ret->nr_locked, 0); return ret; } @@ -221,62 +219,6 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info 
*fs_info, __start_bit; \ }) -void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = len >> fs_info->sectorsize_bits; - unsigned long flags; - - - btrfs_subpage_assert(fs_info, folio, start, len); - - spin_lock_irqsave(&subpage->lock, flags); - /* - * Even though it's just for reading the page, no one should have - * locked the subpage range. - */ - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - bitmap_set(subpage->bitmaps, start_bit, nbits); - atomic_add(nbits, &subpage->readers); - spin_unlock_irqrestore(&subpage->lock, flags); -} - -void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = len >> fs_info->sectorsize_bits; - unsigned long flags; - bool is_data; - bool last; - - btrfs_subpage_assert(fs_info, folio, start, len); - is_data = is_data_inode(BTRFS_I(folio->mapping->host)); - - spin_lock_irqsave(&subpage->lock, flags); - - /* The range should have already been locked. */ - ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); - ASSERT(atomic_read(&subpage->readers) >= nbits); - - bitmap_clear(subpage->bitmaps, start_bit, nbits); - last = atomic_sub_and_test(nbits, &subpage->readers); - - /* - * For data we need to unlock the page if the last read has finished. - * - * And please don't replace @last with atomic_sub_and_test() call - * inside if () condition. - * As we want the atomic_sub_and_test() to be always executed. 
- */ - if (is_data && last) - folio_unlock(folio); - spin_unlock_irqrestore(&subpage->lock, flags); -} - static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) { u64 orig_start = *start; @@ -295,28 +237,8 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) orig_start + orig_len) - *start; } -static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = (len >> fs_info->sectorsize_bits); - unsigned long flags; - int ret; - - btrfs_subpage_assert(fs_info, folio, start, len); - - spin_lock_irqsave(&subpage->lock, flags); - ASSERT(atomic_read(&subpage->readers) == 0); - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - bitmap_set(subpage->bitmaps, start_bit, nbits); - ret = atomic_add_return(nbits, &subpage->writers); - ASSERT(ret == nbits); - spin_unlock_irqrestore(&subpage->lock, flags); -} - -static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); @@ -334,9 +256,9 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf * extent_clear_unlock_delalloc() for compression path. * * This @locked_page is locked by plain lock_page(), thus its - * subpage::writers is 0. Handle them in a special way. + * subpage::locked is 0. Handle them in a special way. 
*/ - if (atomic_read(&subpage->writers) == 0) { + if (atomic_read(&subpage->nr_locked) == 0) { spin_unlock_irqrestore(&subpage->lock, flags); return true; } @@ -345,40 +267,13 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf clear_bit(bit, subpage->bitmaps); cleared++; } - ASSERT(atomic_read(&subpage->writers) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->writers); + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); spin_unlock_irqrestore(&subpage->lock, flags); return last; } /* - * Lock a folio for delalloc page writeback. - * - * Return -EAGAIN if the page is not properly initialized. - * Return 0 with the page locked, and writer counter updated. - * - * Even with 0 returned, the page still need extra check to make sure - * it's really the correct page, as the caller is using - * filemap_get_folios_contig(), which can race with page invalidating. - */ -int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { - folio_lock(folio); - return 0; - } - folio_lock(folio); - if (!folio_test_private(folio) || !folio_get_private(folio)) { - folio_unlock(folio); - return -EAGAIN; - } - btrfs_subpage_clamp_range(folio, &start, &len); - btrfs_subpage_start_writer(fs_info, folio, start, len); - return 0; -} - -/* * Handle different locked folios: * * - Non-subpage folio @@ -394,8 +289,8 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, * bitmap, reduce the writer lock number, and unlock the page if that's * the last locked range. 
*/ -void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); @@ -408,24 +303,24 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, /* * For subpage case, there are two types of locked page. With or - * without writers number. + * without locked number. * - * Since we own the page lock, no one else could touch subpage::writers + * Since we own the page lock, no one else could touch subpage::locked * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page(). */ + if (atomic_read(&subpage->nr_locked) == 0) { + /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); return; } btrfs_subpage_clamp_range(folio, &start, &len); - if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) + if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len)) folio_unlock(folio); } -void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, - struct folio *folio, unsigned long bitmap) +void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap) { struct btrfs_subpage *subpage = folio_get_private(folio); const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; @@ -434,13 +329,13 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, int cleared = 0; int bit; - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio->mapping)) { folio_unlock(folio); return; } - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page(). */ + if (atomic_read(&subpage->nr_locked) == 0) { + /* No subpage lock, locked by plain lock_page(). 
*/ folio_unlock(folio); return; } @@ -450,8 +345,8 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) cleared++; } - ASSERT(atomic_read(&subpage->writers) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->writers); + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); spin_unlock_irqrestore(&subpage->lock, flags); if (last) folio_unlock(folio); @@ -776,8 +671,8 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, * This populates the involved subpage ranges so that subpage helpers can * properly unlock them. */ -void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage; unsigned long flags; @@ -796,58 +691,11 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, /* Target range should not yet be locked. */ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); bitmap_set(subpage->bitmaps, start_bit, nbits); - ret = atomic_add_return(nbits, &subpage->writers); + ret = atomic_add_return(nbits, &subpage->nr_locked); ASSERT(ret <= fs_info->sectors_per_page); spin_unlock_irqrestore(&subpage->lock, flags); } -/* - * Find any subpage writer locked range inside @folio, starting at file offset - * @search_start. The caller should ensure the folio is locked. - * - * Return true and update @found_start_ret and @found_len_ret to the first - * writer locked range. - * Return false if there is no writer locked range. 
- */ -bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 search_start, - u64 *found_start_ret, u32 *found_len_ret) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const u32 sectors_per_page = fs_info->sectors_per_page; - const unsigned int len = PAGE_SIZE - offset_in_page(search_start); - const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, - locked, search_start, len); - const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked; - const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page; - unsigned long flags; - int first_zero; - int first_set; - bool found = false; - - ASSERT(folio_test_locked(folio)); - spin_lock_irqsave(&subpage->lock, flags); - first_set = find_next_bit(subpage->bitmaps, locked_bitmap_end, start_bit); - if (first_set >= locked_bitmap_end) - goto out; - - found = true; - - *found_start_ret = folio_pos(folio) + - ((first_set - locked_bitmap_start) << fs_info->sectorsize_bits); - /* - * Since @first_set is ensured to be smaller than locked_bitmap_end - * here, @found_start_ret should be inside the folio. - */ - ASSERT(*found_start_ret < folio_pos(folio) + PAGE_SIZE); - - first_zero = find_next_zero_bit(subpage->bitmaps, locked_bitmap_end, first_set); - *found_len_ret = (first_zero - first_set) << fs_info->sectorsize_bits; -out: - spin_unlock_irqrestore(&subpage->lock, flags); - return found; -} - #define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ { \ const int sectors_per_page = fs_info->sectors_per_page; \ diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 4b85d91d0e18..428fa9389fd4 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -45,14 +45,6 @@ enum { struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; - /* - * Both data and metadata needs to track how many readers are for the - * page. 
- * Data relies on @readers to unlock the page when last reader finished. - * While metadata doesn't need page unlock, it needs to prevent - * page::private get cleared before the last end_page_read(). - */ - atomic_t readers; union { /* * Structures only used by metadata @@ -62,8 +54,12 @@ struct btrfs_subpage { */ atomic_t eb_refs; - /* Structures only used by data */ - atomic_t writers; + /* + * Structures only used by data, + * + * How many sectors inside the page are locked. + */ + atomic_t nr_locked; }; unsigned long bitmaps[]; }; @@ -95,23 +91,12 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage); void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); -void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); - -int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, - struct folio *folio, unsigned long bitmap); -bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 search_start, - u64 *found_start_ret, u32 *found_len_ret); - +void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap); /* *
Template for subpage related operations. * diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 98fa0f382480..97a85d180b61 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -28,7 +28,6 @@ #include <linux/btrfs.h> #include <linux/security.h> #include <linux/fs_parser.h> -#include <linux/swap.h> #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -340,6 +339,15 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) fallthrough; case Opt_compress: case Opt_compress_type: + /* + * Provide the same semantics as older kernels that don't use fs + * context, specifying the "compress" option clears + * "force-compress" without the need to pass + * "compress-force=[no|none]" before specifying "compress". + */ + if (opt != Opt_compress_force && opt != Opt_compress_force_type) + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + if (opt == Opt_compress || opt == Opt_compress_force) { ctx->compress_type = BTRFS_COMPRESS_ZLIB; ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; @@ -937,8 +945,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec } static int btrfs_fill_super(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - void *data) + struct btrfs_fs_devices *fs_devices) { struct inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -962,7 +969,7 @@ static int btrfs_fill_super(struct super_block *sb, return err; } - err = open_ctree(sb, fs_devices, (char *)data); + err = open_ctree(sb, fs_devices); if (err) { btrfs_err(fs_info, "open_ctree failed"); return err; @@ -1498,8 +1505,7 @@ static int btrfs_reconfigure(struct fs_context *fc) sync_filesystem(sb); set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); - if (!mount_reconfigure && - !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) + if (!btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) return -EINVAL; ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY)); @@ -1885,7 +1891,7 @@ 
static int btrfs_get_tree_super(struct fs_context *fc) snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; - ret = btrfs_fill_super(sb, fs_devices, NULL); + ret = btrfs_fill_super(sb, fs_devices); } if (ret) { @@ -1971,25 +1977,10 @@ error: * fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem * in fc->sb_flags. * - * This disambiguation has rather positive consequences. Mounting a subvolume - * ro will not also turn the superblock ro. Only the mount for the subvolume - * will become ro. - * - * So, if the superblock creation request comes from the new mount API the - * caller must have explicitly done: - * - * fsconfig(FSCONFIG_SET_FLAG, "ro") - * fsmount/mount_setattr(MOUNT_ATTR_RDONLY) - * - * IOW, at some point the caller must have explicitly turned the whole - * superblock ro and we shouldn't just undo it like we did for the old mount - * API. In any case, it lets us avoid the hack in the new mount API. - * - * Consequently, the remounting hack must only be used for requests originating - * from the old mount API and should be marked for full deprecation so it can be - * turned off in a couple of years. - * - * The new mount API has no reason to support this hack. + * But, currently the util-linux mount command already utilizes the new mount + * API and is still setting fsconfig(FSCONFIG_SET_FLAG, "ro") no matter if it's + * btrfs or not, setting the whole super block RO. To make per-subvolume mounting + * with different options work, we need to keep backward compatibility. */ static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc) { @@ -2011,7 +2002,7 @@ static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc) if (IS_ERR(mnt)) return mnt; - if (!fc->oldapi || !ro2rw) + if (!ro2rw) return mnt; /* We need to convert to rw, call reconfigure.
*/ @@ -2198,7 +2189,8 @@ static struct file_system_type btrfs_fs_type = { .init_fs_context = btrfs_init_fs_context, .parameters = btrfs_fs_parameters, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | + FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("btrfs"); @@ -2263,7 +2255,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); - ret = PTR_ERR(device); + if (IS_ERR(device)) + ret = PTR_ERR(device); + else + ret = 0; break; } ret = !(device->fs_devices->num_devices == @@ -2402,13 +2397,7 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro trace_btrfs_extent_map_shrinker_count(fs_info, nr); - /* - * Only report the real number for DEBUG builds, as there are reports of - * serious performance degradation caused by too frequent shrinks. - */ - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) - return nr; - return 0; + return nr; } static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) @@ -2416,16 +2405,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); struct btrfs_fs_info *fs_info = btrfs_sb(sb); - /* - * We may be called from any task trying to allocate memory and we don't - * want to slow it down with scanning and dropping extent maps. It would - * also cause heavy lock contention if many tasks concurrently enter - * here. Therefore only allow kswapd tasks to scan and drop extent maps. - */ - if (!current_is_kswapd()) - return 0; + btrfs_free_extent_maps(fs_info, nr_to_scan); - return btrfs_free_extent_maps(fs_info, nr_to_scan); + /* The extent map shrinker runs asynchronously, so always return 0. 
*/ + return 0; } static const struct super_operations btrfs_super_ops = { diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 03926ad467c9..b843308e2bc6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1390,7 +1390,7 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL static ssize_t btrfs_offload_csum_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1450,7 +1450,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_ATTR_PTR(, offload_csum), #endif NULL, diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ce50847e1e01..e607b5d52fb1 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -29,6 +29,7 @@ const char *test_error[] = { [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", + [TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context", }; static const struct super_operations btrfs_test_super_ops = { @@ -291,6 +292,9 @@ int btrfs_run_sanity_tests(void) ret = btrfs_test_free_space_tree(sectorsize, nodesize); if (ret) goto out; + ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize); + if (ret) + goto out; } } ret = btrfs_test_extent_map(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index dc2f2ab15fa5..b524ecf2f452 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -24,6 +24,7 @@ enum { TEST_ALLOC_BLOCK_GROUP, TEST_ALLOC_EXTENT_MAP, TEST_ALLOC_CHUNK_MAP, + TEST_ALLOC_IO_CONTEXT, }; extern const char *test_error[]; @@ -37,6 +38,7 @@ int btrfs_test_extent_io(u32 
sectorsize, u32 nodesize); int btrfs_test_inodes(u32 sectorsize, u32 nodesize); int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); +int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); int btrfs_test_extent_map(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c new file mode 100644 index 000000000000..30f17eb7b6a8 --- /dev/null +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -0,0 +1,538 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 Western Digital Corporation or its affiliates. + */ + +#include <linux/sizes.h> +#include "../fs.h" +#include "../disk-io.h" +#include "../transaction.h" +#include "../volumes.h" +#include "../raid-stripe-tree.h" +#include "btrfs-tests.h" + +#define RST_TEST_NUM_DEVICES (2) +#define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) + +typedef int (*test_func_t)(struct btrfs_trans_handle *trans); + +static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices, + u64 devid) +{ + struct btrfs_device *dev; + + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (dev->devid == devid) + return dev; + } + + return NULL; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * delete the 1st 32K, making the new start address 1M+32K. 
+ */ +static int test_front_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, SZ_32K); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + SZ_32K); + goto out; + } + + len = SZ_32K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len, + map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical + SZ_32K, logical + SZ_32K + len); + goto out; + } + + if (io_stripe.physical != logical + 
SZ_32K) { + test_err("invalid physical address, expected %llu, got %llu", + logical + SZ_32K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (!ret) { + ret = -EINVAL; + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical, logical + SZ_32K); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * truncate the stripe extent down to 32K. + */ +static int test_tail_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, 
&io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical + SZ_32K, logical + SZ_64K); + goto out; + } + + len = SZ_32K; + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu, got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * overwrite the whole range giving it new physical address at an offset of 1G. + * The intent of this test is to exercise the 'update_raid_extent_item()' + * function called be btrfs_insert_one_raid_extent(). 
+ */ +static int test_create_update_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = SZ_1G + logical + i * 
SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("updating RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical + SZ_1G) { + test_err("invalid physical address, expected %llu, got %llu", + logical + SZ_1G, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a simple 64K RST write on a 2 disk RAID1 at a logical address of 1M. + * The "physical" copy on device 0 is at 1M, on device 1 it is at 1G+1M. 
+ */ +static int test_simple_create_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + bioc->map_type = map_type; + bioc->size = SZ_64K; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +static const test_func_t tests[] = { + test_simple_create_delete, + test_create_update_delete, + test_tail_delete, + test_front_delete, +}; + +static int run_test(test_func_t test, u32 
sectorsize, u32 nodesize) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root = NULL; + int ret; + + fs_info = btrfs_alloc_dummy_fs_info(sectorsize, nodesize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + ret = -ENOMEM; + goto out; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + goto out; + } + btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, + BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); + root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + fs_info->stripe_root = root; + root->fs_info->tree_root = root; + + root->node = alloc_test_extent_buffer(root->fs_info, nodesize); + if (IS_ERR(root->node)) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = PTR_ERR(root->node); + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 2 * nodesize; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_device *dev; + + dev = btrfs_alloc_dummy_device(fs_info); + if (IS_ERR(dev)) { + test_err("cannot allocate device"); + ret = PTR_ERR(dev); + goto out; + } + dev->devid = i; + } + + btrfs_init_dummy_trans(&trans, root->fs_info); + ret = test(&trans); + if (ret) + goto out; + +out: + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); + + return ret; +} + +int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize) +{ + int ret = 0; + + test_msg("running raid-stripe-tree tests"); + for (int i = 0; i < ARRAY_SIZE(tests); i++) { + ret = run_test(tests[i], sectorsize, nodesize); + if (ret) { + test_err("test-case %ps failed with %d\n", tests[i], ret); + goto out; + } + } + +out: + return ret; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0fc873af891f..dc0b837efd5d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -141,8 
+141,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) WARN_ON(refcount_read(&transaction->use_count) == 0); if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); - WARN_ON(!RB_EMPTY_ROOT( - &transaction->delayed_refs.href_root.rb_root)); + WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs)); WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, @@ -349,9 +348,8 @@ loop: memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); - cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; + xa_init(&cur_trans->delayed_refs.head_refs); xa_init(&cur_trans->delayed_refs.dirty_extents); - atomic_set(&cur_trans->delayed_refs.num_entries, 0); /* * although the tree mod log is per file system and not per transaction, @@ -2052,7 +2050,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) spin_unlock(&fs_info->trans_lock); - btrfs_cleanup_one_transaction(trans->transaction, fs_info); + btrfs_cleanup_one_transaction(trans->transaction); spin_lock(&fs_info->trans_lock); if (cur_trans == fs_info->running_transaction) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index dd9ce9b9f69e..184fa5c0062a 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -33,7 +33,7 @@ struct btrfs_path; */ #define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) -/* Radix-tree tag for roots that are part of the trasaction. */ +/* Radix-tree tag for roots that are part of the transaction. 
*/ #define BTRFS_ROOT_TRANS_TAG 0 enum btrfs_trans_state { diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7b50263723bc..148d8cefa40e 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -2183,8 +2183,8 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) return 0; } -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) +int btrfs_verify_level_key(struct extent_buffer *eb, + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; int found_level; @@ -2192,16 +2192,16 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, int ret; found_level = btrfs_header_level(eb); - if (found_level != level) { + if (found_level != check->level) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree level check failed\n"); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", - eb->start, level, found_level); + eb->start, check->level, found_level); return -EIO; } - if (!first_key) + if (!check->has_first_key) return 0; /* @@ -2226,15 +2226,15 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(first_key, &found_key); + ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); if (ret) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree first key check failed\n"); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", - eb->start, parent_transid, first_key->objectid, - first_key->type, first_key->offset, + eb->start, check->transid, check->first_key.objectid, + check->first_key.type, check->first_key.offset, found_key.objectid, found_key.type, found_key.offset); } diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 
01669cfa6578..db67f96cbe4b 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -69,7 +69,7 @@ int btrfs_check_node(struct extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 logical); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid); +int btrfs_verify_level_key(struct extent_buffer *eb, + const struct btrfs_tree_parent_check *check); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e2ed2a791f8f..c8d6587688b3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1374,7 +1374,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; int ret; int log_ref_ver = 0; u64 parent_objectid; @@ -1845,7 +1845,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di, struct btrfs_key *key) { - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; @@ -2125,7 +2125,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; struct inode *inode = NULL; struct btrfs_key location; @@ -6204,7 +6204,6 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_log_ctx *ctx, const struct list_head *delayed_del_list, const struct btrfs_delayed_item *first, const struct btrfs_delayed_item **last_ret) @@ -6265,7 +6264,7 @@ static int 
log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, if (ret < 0) { return ret; } else if (ret == 0) { - ret = batch_delete_dir_index_items(trans, inode, path, ctx, + ret = batch_delete_dir_index_items(trans, inode, path, delayed_del_list, curr, &last); if (ret) diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index b382a4c443d4..1ac2678fc4ca 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -909,7 +909,6 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, * is freed (its refcount is decremented). */ struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, struct extent_buffer *eb, u64 time_seq) { diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 6308c577a4a4..1c12566040db 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -41,7 +41,6 @@ int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, struct extent_buffer *eb, u64 time_seq); struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8f340ad1d938..1cccaf9c2b0d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -733,6 +733,114 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) } /* + * We can have very weird soft links passed in. + * One example is "/proc/self/fd/<fd>", which can be a soft link to + * a block device. + * + * But it's never a good idea to use those weird names. + * Here we check if the path (not following symlinks) is a good one inside + * "/dev/". 
+ */ +static bool is_good_dev_path(const char *dev_path) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + bool is_good = false; + int ret; + + if (!dev_path) + goto out; + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) + goto out; + + /* + * Do not follow soft link, just check if the original path is inside + * "/dev/". + */ + ret = kern_path(dev_path, 0, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + if (IS_ERR(resolved_path)) + goto out; + if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) + goto out; + is_good = true; +out: + kfree(path_buf); + path_put(&path); + return is_good; +} + +static int get_canonical_dev_path(const char *dev_path, char *canonical) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + int ret; + + if (!dev_path) { + ret = -EINVAL; + goto out; + } + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) { + ret = -ENOMEM; + goto out; + } + + ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + ret = strscpy(canonical, resolved_path, PATH_MAX); +out: + kfree(path_buf); + path_put(&path); + return ret; +} + +static bool is_same_device(struct btrfs_device *device, const char *new_path) +{ + struct path old = { .mnt = NULL, .dentry = NULL }; + struct path new = { .mnt = NULL, .dentry = NULL }; + char *old_path = NULL; + bool is_same = false; + int ret; + + if (!device->name) + goto out; + + old_path = kzalloc(PATH_MAX, GFP_NOFS); + if (!old_path) + goto out; + + rcu_read_lock(); + ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX); + rcu_read_unlock(); + if (ret < 0) + goto out; + + ret = kern_path(old_path, LOOKUP_FOLLOW, &old); + if (ret) + goto out; + ret = kern_path(new_path, LOOKUP_FOLLOW, &new); + if (ret) + goto out; + if (path_equal(&old, &new)) + is_same = true; +out: + 
kfree(old_path); + path_put(&old); + path_put(&new); + return is_same; +} + +/* * Add new device to list of registered devices * * Returns: @@ -852,7 +960,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); - } else if (!device->name || strcmp(device->name->str, path)) { + } else if (!device->name || !is_same_device(device, path)) { /* * When FS is already mounted. * 1. If you are here and if the device->name is NULL that @@ -1105,6 +1213,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) if (device->bdev) { fs_devices->open_devices--; device->bdev = NULL; + device->bdev_file = NULL; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); btrfs_destroy_dev_zone_info(device); @@ -1382,12 +1491,23 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct file *bdev_file; + char *canonical_path = NULL; u64 bytenr; dev_t devt; int ret; lockdep_assert_held(&uuid_mutex); + if (!is_good_dev_path(path)) { + canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (canonical_path) { + ret = get_canonical_dev_path(path, canonical_path); + if (ret < 0) { + kfree(canonical_path); + canonical_path = NULL; + } + } + } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, @@ -1432,7 +1552,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, goto free_disk_super; } - device = device_list_add(path, disk_super, &new_device_added); + device = device_list_add(canonical_path ? 
: path, disk_super, + &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); @@ -1441,6 +1562,7 @@ free_disk_super: error_bdev_put: fput(bdev_file); + kfree(canonical_path); return device; } @@ -2720,8 +2842,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { - btrfs_clear_sb_rdonly(sb); - /* GFP_KERNEL allocation must not be under device_list_mutex */ seed_devices = btrfs_init_sprout(fs_info); if (IS_ERR(seed_devices)) { @@ -2864,8 +2984,6 @@ error_sysfs: mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); error_trans: - if (seeding_dev) - btrfs_set_sb_rdonly(sb); if (trans) btrfs_end_transaction(trans); error_free_zone: @@ -5309,7 +5427,7 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; - /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ + /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. 
*/ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, ctl->stripe_size) + ctl->nparity, @@ -5841,24 +5959,6 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) -{ - struct btrfs_chunk_map *map; - int ret = 0; - - if (!btrfs_fs_incompat(fs_info, RAID56)) - return 0; - - map = btrfs_get_chunk_map(fs_info, logical, len); - - if (!WARN_ON(IS_ERR(map))) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - ret = 1; - btrfs_free_chunk_map(map); - } - return ret; -} - static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) @@ -5919,9 +6019,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, return preferred_mirror; } -static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, - u64 logical, - u16 total_stripes) +EXPORT_FOR_TESTS +struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes) { struct btrfs_io_context *bioc; @@ -6480,13 +6580,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); - down_read(&dev_replace->rwsem); + if (dev_replace->replace_task != current) + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); /* * Hold the semaphore for read during the whole operation, write is * requested at commit time but must wait. 
*/ - if (!dev_replace_is_ongoing) + if (!dev_replace_is_ongoing && dev_replace->replace_task != current) up_read(&dev_replace->rwsem); switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { @@ -6626,7 +6728,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, bioc->mirror_num = io_geom.mirror_num; out: - if (dev_replace_is_ongoing) { + if (dev_replace_is_ongoing && dev_replace->replace_task != current) { lockdep_assert_held(&dev_replace->rwsem); /* Unlock and let waiting writers proceed */ up_read(&dev_replace->rwsem); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 4481575dd70f..3a416b1bc24c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -306,7 +306,7 @@ enum btrfs_read_policy { BTRFS_NR_READ_POLICY, }; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* * Checksum mode - offload it to workqueues or do it synchronously in * btrfs_submit_chunk(). @@ -430,7 +430,7 @@ struct btrfs_fs_devices { /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* Checksum mode - offload it or do it synchronously. 
*/ enum btrfs_offload_csum_mode offload_csum_mode; #endif @@ -741,8 +741,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, - u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); @@ -840,4 +838,9 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes); +#endif + #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index ce464cd8e0ac..bc18710d1dcf 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -85,7 +85,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, { struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; size_t name_len = strlen(name); int ret = 0; @@ -143,14 +142,14 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, */ ret = 0; btrfs_assert_tree_write_locked(path->nodes[0]); - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); if (!di && !(flags & XATTR_REPLACE)) { ret = -ENOSPC; goto out; } } else if (ret == -EEXIST) { ret = 0; - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); ASSERT(di); /* logic error */ } else if (ret) { goto out; diff --git a/fs/btrfs/zlib.c 
b/fs/btrfs/zlib.c index 100abc00b794..ddf0d5a448a7 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -194,7 +194,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, pg_off = offset_in_page(start); cur_len = btrfs_calc_input_length(orig_end, start); data_in = kmap_local_folio(in_folio, pg_off); - start += PAGE_SIZE; + start += cur_len; workspace->strm.next_in = data_in; workspace->strm.avail_in = cur_len; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 69d03feea4e0..11ed523e528e 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -707,11 +707,14 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) * zoned mode. In this case, we don't have a valid max zone * append size. */ - if (bdev_is_zoned(device->bdev)) { - blk_stack_limits(lim, - &bdev_get_queue(device->bdev)->limits, - 0); - } + if (bdev_is_zoned(device->bdev)) + blk_stack_limits(lim, bdev_limits(device->bdev), 0); + } + + ret = blk_validate_limits(lim); + if (ret) { + btrfs_err(fs_info, "zoned: failed to validate queue limits"); + return ret; } /* @@ -1739,7 +1742,7 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) return false; /* - * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the + * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the * extent layout the relocation code has. * Furthermore we have set aside own block-group from which only the * relocation "process" can allocate and make sure only one process at a @@ -1973,7 +1976,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, if (block_group->meta_write_pointer > eb->start) return -EBUSY; - /* If for_sync, this hole will be filled with trasnsaction commit. */ + /* If for_sync, this hole will be filled with transaction commit. 
*/ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) return -EAGAIN; return -EBUSY; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 866607fd3e58..5232b56d5892 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -111,6 +111,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; + ASSERT(timer == &wsm.timer); + spin_lock(&wsm.lock); if (list_empty(&wsm.lru_list)) { @@ -495,7 +497,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { - tot_in += PAGE_SIZE; + tot_in += workspace->in_buf.size; kunmap_local(workspace->in_buf.src); workspace->in_buf.src = NULL; folio_put(in_folio); diff --git a/fs/buffer.c b/fs/buffer.c index 1fc9a50def0b..bb4a31b9559d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1649,6 +1649,7 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length) if (length == folio_size(folio)) filemap_release_folio(folio, 0); out: + folio_clear_mappedtodisk(folio); return; } EXPORT_SYMBOL(block_invalidate_folio); @@ -2803,7 +2804,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_write_hint = write_hint; - __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh)); bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; @@ -2813,7 +2814,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, if (wbc) { wbc_init_bio(wbc, bio); - wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size); + wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size); } submit_bio(bio); diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 35ba2117a6f6..3e63cfe15874 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -327,6 
+327,8 @@ static void cachefiles_commit_object(struct cachefiles_object *object, static void cachefiles_clean_up_object(struct cachefiles_object *object, struct cachefiles_cache *cache) { + struct file *file; + if (test_bit(FSCACHE_COOKIE_RETIRED, &object->cookie->flags)) { if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { cachefiles_see_object(object, cachefiles_obj_see_clean_delete); @@ -342,10 +344,14 @@ static void cachefiles_clean_up_object(struct cachefiles_object *object, } cachefiles_unmark_inode_in_use(object, object->file); - if (object->file) { - fput(object->file); - object->file = NULL; - } + + spin_lock(&object->lock); + file = object->file; + object->file = NULL; + spin_unlock(&object->lock); + + if (file) + fput(file); } /* diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 2b3f9935dbb4..7cf59713f0f7 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -691,11 +691,6 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, } if (!d_is_negative(dentry)) { - if (d_backing_inode(dentry) == file_inode(object->file)) { - success = true; - goto out_dput; - } - ret = cachefiles_unlink(volume->cache, object, fan, dentry, FSCACHE_OBJECT_IS_STALE); if (ret < 0) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 470c96658385..fe3de9ad57bf 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -60,26 +60,36 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, { struct cachefiles_object *object = kiocb->ki_filp->private_data; struct cachefiles_cache *cache = object->volume->cache; - struct file *file = object->file; - size_t len = iter->count; + struct file *file; + size_t len = iter->count, aligned_len = len; loff_t pos = kiocb->ki_pos; const struct cred *saved_cred; int ret; - if (!file) + spin_lock(&object->lock); + file = object->file; + if (!file) { + spin_unlock(&object->lock); return -ENOBUFS; + } + get_file(file); + spin_unlock(&object->lock); 
cachefiles_begin_secure(cache, &saved_cred); - ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true); + ret = __cachefiles_prepare_write(object, file, &pos, &aligned_len, len, true); cachefiles_end_secure(cache, saved_cred); if (ret < 0) - return ret; + goto out; trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len); ret = __cachefiles_write(object, file, pos, iter, NULL, NULL); - if (!ret) + if (!ret) { ret = len; + kiocb->ki_pos += ret; + } +out: + fput(file); return ret; } @@ -87,12 +97,22 @@ static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos, int whence) { struct cachefiles_object *object = filp->private_data; - struct file *file = object->file; + struct file *file; + loff_t ret; - if (!file) + spin_lock(&object->lock); + file = object->file; + if (!file) { + spin_unlock(&object->lock); return -ENOBUFS; + } + get_file(file); + spin_unlock(&object->lock); - return vfs_llseek(file, pos, whence); + ret = vfs_llseek(file, pos, whence); + fput(file); + + return ret; } static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl, diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c2a9e2cc03de..4c82348fe1e6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1054,7 +1054,9 @@ get_more_pages: if (!nr_folios && !locked_pages) break; for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { - page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; + + page = &folio->page; doutc(cl, "? 
%p idx %lu\n", page, page->index); if (locked_pages == 0) lock_page(page); /* first page */ @@ -1081,8 +1083,6 @@ get_more_pages: continue; } if (page_offset(page) >= ceph_wbc.i_size) { - struct folio *folio = page_folio(page); - doutc(cl, "folio at %lu beyond eof %llu\n", folio->index, ceph_wbc.i_size); if ((ceph_wbc.size_stable || @@ -1098,16 +1098,16 @@ get_more_pages: unlock_page(page); break; } - if (PageWriteback(page) || - PagePrivate2(page) /* [DEPRECATED] */) { + if (folio_test_writeback(folio) || + folio_test_private_2(folio) /* [DEPRECATED] */) { if (wbc->sync_mode == WB_SYNC_NONE) { - doutc(cl, "%p under writeback\n", page); - unlock_page(page); + doutc(cl, "%p under writeback\n", folio); + folio_unlock(folio); continue; } - doutc(cl, "waiting on writeback %p\n", page); - wait_on_page_writeback(page); - folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ + doutc(cl, "waiting on writeback %p\n", folio); + folio_wait_writeback(folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ } if (!clear_page_dirty_for_io(page)) { diff --git a/fs/char_dev.c b/fs/char_dev.c index 57cc096c498a..c2ddb998f3c9 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -562,8 +562,8 @@ int cdev_device_add(struct cdev *cdev, struct device *dev) /** * cdev_device_del() - inverse of cdev_device_add - * @dev: the device structure * @cdev: the cdev structure + * @dev: the device structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. 
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 8f0af4f62631..d5ef5469e4e6 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -80,6 +80,16 @@ #define ELF_HWCAP2 COMPAT_ELF_HWCAP2 #endif +#ifdef COMPAT_ELF_HWCAP3 +#undef ELF_HWCAP3 +#define ELF_HWCAP3 COMPAT_ELF_HWCAP3 +#endif + +#ifdef COMPAT_ELF_HWCAP4 +#undef ELF_HWCAP4 +#define ELF_HWCAP4 COMPAT_ELF_HWCAP4 +#endif + #ifdef COMPAT_ARCH_DLINFO #undef ARCH_DLINFO #define ARCH_DLINFO COMPAT_ARCH_DLINFO diff --git a/fs/coredump.c b/fs/coredump.c index 45737b43dda5..d48edb37bc35 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -951,6 +951,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, } else { dump_skip(cprm, PAGE_SIZE); } + cond_resched(); } dump_page_free(dump_page); return 1; diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 206835e31efa..787e9c8938ba 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -22,6 +22,7 @@ #include <crypto/skcipher.h> #include <linux/key-type.h> #include <linux/random.h> +#include <linux/once.h> #include <linux/seq_file.h> #include "fscrypt_private.h" @@ -1262,35 +1262,46 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); + loff_t copy_pos = iter->pos; + u64 copy_len = iomap_length(iter); + u32 mod; int id = 0; s64 ret = 0; void *daddr = NULL, *saddr = NULL; - /* don't bother with blocks that are not shared to start with */ - if (!(iomap->flags & IOMAP_F_SHARED)) - return length; + if (!iomap_want_unshare_iter(iter)) + return iomap_length(iter); + + /* + * Extend the file range to be aligned to fsblock/pagesize, because + * we need to copy entire blocks, not just the byte range specified. + * Invalidate the mapping because we're about to CoW. 
+ */ + mod = offset_in_page(copy_pos); + if (mod) { + copy_len += mod; + copy_pos -= mod; + } + + mod = offset_in_page(copy_pos + copy_len); + if (mod) + copy_len += PAGE_SIZE - mod; + + invalidate_inode_pages2_range(iter->inode->i_mapping, + copy_pos >> PAGE_SHIFT, + (copy_pos + copy_len - 1) >> PAGE_SHIFT); id = dax_read_lock(); - ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL); + ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL); if (ret < 0) goto out_unlock; - /* zero the distance if srcmap is HOLE or UNWRITTEN */ - if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) { - memset(daddr, 0, length); - dax_flush(iomap->dax_dev, daddr, length); - ret = length; - goto out_unlock; - } - - ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL); + ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL); if (ret < 0) goto out_unlock; - if (copy_mc_to_kernel(daddr, saddr, length) == 0) - ret = length; + if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0) + ret = iomap_length(iter); else ret = -EIO; diff --git a/fs/dcache.c b/fs/dcache.c index 0f6b16ba30d0..0099077a2982 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -135,6 +135,7 @@ struct dentry_stat_t { static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); +static int dentry_negative_policy; #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* Statistics gathering. 
*/ @@ -199,6 +200,15 @@ static struct ctl_table fs_dcache_sysctls[] = { .mode = 0444, .proc_handler = proc_nr_dentry, }, + { + .procname = "dentry-negative", + .data = &dentry_negative_policy, + .maxlen = sizeof(dentry_negative_policy), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; static int __init init_fs_dcache_sysctls(void) @@ -2039,8 +2049,8 @@ EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name - * @inode: the inode case-insensitive lookup has found * @dentry: the negative dentry that was passed to the parent's lookup func + * @inode: the inode case-insensitive lookup has found * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the @@ -2093,8 +2103,8 @@ EXPORT_SYMBOL(d_add_ci); /** * d_same_name - compare dentry name with case-exact name - * @parent: parent dentry * @dentry: the negative dentry that was passed to the parent's lookup func + * @parent: parent dentry * @name: the case-exact name to be associated with the returned dentry * * Return: true if names are same, or false @@ -2401,6 +2411,8 @@ void d_delete(struct dentry * dentry) * Are we the only user? 
*/ if (dentry->d_lockref.count == 1) { + if (dentry_negative_policy) + __d_drop(dentry); dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 827278525fd9..69536cacdea8 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -328,10 +328,10 @@ out: * Convert an eCryptfs page index into a lower byte offset */ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat, - struct page *page) + struct folio *folio) { return ecryptfs_lower_header_size(crypt_stat) + - ((loff_t)page->index << PAGE_SHIFT); + (loff_t)folio->index * PAGE_SIZE; } /** @@ -340,6 +340,7 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat, * encryption operation * @dst_page: The page to write the result into * @src_page: The page to read from + * @page_index: The offset in the file (in units of PAGE_SIZE) * @extent_offset: Page extent offset for use in generating IV * @op: ENCRYPT or DECRYPT to indicate the desired operation * @@ -350,9 +351,9 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat, static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat, struct page *dst_page, struct page *src_page, + pgoff_t page_index, unsigned long extent_offset, int op) { - pgoff_t page_index = op == ENCRYPT ? 
src_page->index : dst_page->index; loff_t extent_base; char extent_iv[ECRYPTFS_MAX_IV_BYTES]; struct scatterlist src_sg, dst_sg; @@ -392,7 +393,7 @@ out: /** * ecryptfs_encrypt_page - * @page: Page mapped from the eCryptfs inode for the file; contains + * @folio: Folio mapped from the eCryptfs inode for the file; contains * decrypted content that needs to be encrypted (to a temporary * page; not in place) and written out to the lower file * @@ -406,7 +407,7 @@ out: * * Returns zero on success; negative on error */ -int ecryptfs_encrypt_page(struct page *page) +int ecryptfs_encrypt_page(struct folio *folio) { struct inode *ecryptfs_inode; struct ecryptfs_crypt_stat *crypt_stat; @@ -416,7 +417,7 @@ int ecryptfs_encrypt_page(struct page *page) loff_t lower_offset; int rc = 0; - ecryptfs_inode = page->mapping->host; + ecryptfs_inode = folio->mapping->host; crypt_stat = &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); @@ -431,8 +432,9 @@ int ecryptfs_encrypt_page(struct page *page) for (extent_offset = 0; extent_offset < (PAGE_SIZE / crypt_stat->extent_size); extent_offset++) { - rc = crypt_extent(crypt_stat, enc_extent_page, page, - extent_offset, ENCRYPT); + rc = crypt_extent(crypt_stat, enc_extent_page, + folio_page(folio, 0), folio->index, + extent_offset, ENCRYPT); if (rc) { printk(KERN_ERR "%s: Error encrypting extent; " "rc = [%d]\n", __func__, rc); @@ -440,7 +442,7 @@ int ecryptfs_encrypt_page(struct page *page) } } - lower_offset = lower_offset_for_page(crypt_stat, page); + lower_offset = lower_offset_for_page(crypt_stat, folio); enc_extent_virt = kmap_local_page(enc_extent_page); rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset, PAGE_SIZE); @@ -461,7 +463,7 @@ out: /** * ecryptfs_decrypt_page - * @page: Page mapped from the eCryptfs inode for the file; data read + * @folio: Folio mapped from the eCryptfs inode for the file; data read * and decrypted from the lower file will be 
written into this * page * @@ -475,7 +477,7 @@ out: * * Returns zero on success; negative on error */ -int ecryptfs_decrypt_page(struct page *page) +int ecryptfs_decrypt_page(struct folio *folio) { struct inode *ecryptfs_inode; struct ecryptfs_crypt_stat *crypt_stat; @@ -484,13 +486,13 @@ int ecryptfs_decrypt_page(struct page *page) loff_t lower_offset; int rc = 0; - ecryptfs_inode = page->mapping->host; + ecryptfs_inode = folio->mapping->host; crypt_stat = &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); - lower_offset = lower_offset_for_page(crypt_stat, page); - page_virt = kmap_local_page(page); + lower_offset = lower_offset_for_page(crypt_stat, folio); + page_virt = kmap_local_folio(folio, 0); rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE, ecryptfs_inode); kunmap_local(page_virt); @@ -504,8 +506,9 @@ int ecryptfs_decrypt_page(struct page *page) for (extent_offset = 0; extent_offset < (PAGE_SIZE / crypt_stat->extent_size); extent_offset++) { - rc = crypt_extent(crypt_stat, page, page, - extent_offset, DECRYPT); + struct page *page = folio_page(folio, 0); + rc = crypt_extent(crypt_stat, page, page, folio->index, + extent_offset, DECRYPT); if (rc) { printk(KERN_ERR "%s: Error decrypting extent; " "rc = [%d]\n", __func__, rc); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index c586c5db18b5..1f562e75d0e4 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -569,8 +569,8 @@ void ecryptfs_destroy_mount_crypt_stat( struct ecryptfs_mount_crypt_stat *mount_crypt_stat); int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); -int ecryptfs_encrypt_page(struct page *page); -int ecryptfs_decrypt_page(struct page *page); +int ecryptfs_encrypt_page(struct folio *folio); +int ecryptfs_decrypt_page(struct folio *folio); int ecryptfs_write_metadata(struct dentry 
*ecryptfs_dentry, struct inode *ecryptfs_inode); int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); @@ -653,16 +653,15 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size); int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, - struct page *page_for_lower, + struct folio *folio_for_lower, size_t offset_in_page, size_t size); int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size); int ecryptfs_read_lower(char *data, loff_t offset, size_t size, struct inode *ecryptfs_inode); -int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, +int ecryptfs_read_lower_page_segment(struct folio *folio_for_ecryptfs, pgoff_t page_index, size_t offset_in_page, size_t size, struct inode *ecryptfs_inode); -struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index); int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, size_t *length_size); int ecryptfs_write_packet_length(char *dest, size_t size, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index cbdf82f0183f..a9819ddb1ab8 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1008,14 +1008,6 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap, return rc; } -static int ecryptfs_do_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) -{ - if (flags & AT_GETATTR_NOSEC) - return vfs_getattr_nosec(path, stat, request_mask, flags); - return vfs_getattr(path, stat, request_mask, flags); -} - static int ecryptfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) @@ -1024,8 +1016,8 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap, struct kstat lower_stat; int rc; - rc = ecryptfs_do_getattr(ecryptfs_dentry_to_lower_path(dentry), - &lower_stat, request_mask, flags); + rc = 
vfs_getattr_nosec(ecryptfs_dentry_to_lower_path(dentry), + &lower_stat, request_mask, flags); if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index ceda5555971a..60f0ac8744b5 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -23,47 +23,29 @@ #include "ecryptfs_kernel.h" /* - * ecryptfs_get_locked_page - * - * Get one page from cache or lower f/s, return error otherwise. - * - * Returns locked and up-to-date page (if ok), with increased - * refcnt. - */ -struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index) -{ - struct page *page = read_mapping_page(inode->i_mapping, index, NULL); - if (!IS_ERR(page)) - lock_page(page); - return page; -} - -/** - * ecryptfs_writepage - * @page: Page that is locked before this call is made - * @wbc: Write-back control structure - * - * Returns zero on success; non-zero otherwise - * * This is where we encrypt the data and pass the encrypted data to * the lower filesystem. In OpenPGP-compatible mode, we operate on * entire underlying packets. 
*/ -static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) +static int ecryptfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - int rc; - - rc = ecryptfs_encrypt_page(page); - if (rc) { - ecryptfs_printk(KERN_WARNING, "Error encrypting " - "page (upper index [0x%.16lx])\n", page->index); - ClearPageUptodate(page); - goto out; + struct folio *folio = NULL; + int error; + + while ((folio = writeback_iter(mapping, wbc, folio, &error))) { + error = ecryptfs_encrypt_page(folio); + if (error) { + ecryptfs_printk(KERN_WARNING, + "Error encrypting folio (index [0x%.16lx])\n", + folio->index); + folio_clear_uptodate(folio); + mapping_set_error(mapping, error); + } + folio_unlock(folio); } - SetPageUptodate(page); -out: - unlock_page(page); - return rc; + + return error; } static void strip_xattr_flag(char *page_virt, @@ -97,7 +79,7 @@ static void strip_xattr_flag(char *page_virt, /** * ecryptfs_copy_up_encrypted_with_header - * @page: Sort of a ``virtual'' representation of the encrypted lower + * @folio: Sort of a ``virtual'' representation of the encrypted lower * file. The actual lower file does not have the metadata in * the header. This is locked. * @crypt_stat: The eCryptfs inode's cryptographic context @@ -106,7 +88,7 @@ static void strip_xattr_flag(char *page_virt, * seeing, with the header information inserted. 
*/ static int -ecryptfs_copy_up_encrypted_with_header(struct page *page, +ecryptfs_copy_up_encrypted_with_header(struct folio *folio, struct ecryptfs_crypt_stat *crypt_stat) { loff_t extent_num_in_page = 0; @@ -115,9 +97,9 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, int rc = 0; while (extent_num_in_page < num_extents_per_page) { - loff_t view_extent_num = ((((loff_t)page->index) + loff_t view_extent_num = ((loff_t)folio->index * num_extents_per_page) - + extent_num_in_page); + + extent_num_in_page; size_t num_header_extents_at_front = (crypt_stat->metadata_size / crypt_stat->extent_size); @@ -125,21 +107,21 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, /* This is a header extent */ char *page_virt; - page_virt = kmap_local_page(page); + page_virt = kmap_local_folio(folio, 0); memset(page_virt, 0, PAGE_SIZE); /* TODO: Support more than one header extent */ if (view_extent_num == 0) { size_t written; rc = ecryptfs_read_xattr_region( - page_virt, page->mapping->host); + page_virt, folio->mapping->host); strip_xattr_flag(page_virt + 16, crypt_stat); ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written); } kunmap_local(page_virt); - flush_dcache_page(page); + flush_dcache_folio(folio); if (rc) { printk(KERN_ERR "%s: Error reading xattr " "region; rc = [%d]\n", __func__, rc); @@ -152,9 +134,9 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, - crypt_stat->metadata_size); rc = ecryptfs_read_lower_page_segment( - page, (lower_offset >> PAGE_SHIFT), + folio, (lower_offset >> PAGE_SHIFT), (lower_offset & ~PAGE_MASK), - crypt_stat->extent_size, page->mapping->host); + crypt_stat->extent_size, folio->mapping->host); if (rc) { printk(KERN_ERR "%s: Error attempting to read " "extent at offset [%lld] in the lower " @@ -180,55 +162,50 @@ out: */ static int ecryptfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; + struct inode *inode = folio->mapping->host; struct ecryptfs_crypt_stat 
*crypt_stat = - &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat; - int rc = 0; + &ecryptfs_inode_to_private(inode)->crypt_stat; + int err = 0; if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = ecryptfs_read_lower_page_segment(page, page->index, 0, - PAGE_SIZE, - page->mapping->host); + err = ecryptfs_read_lower_page_segment(folio, folio->index, 0, + folio_size(folio), inode); } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { - rc = ecryptfs_copy_up_encrypted_with_header(page, - crypt_stat); - if (rc) { + err = ecryptfs_copy_up_encrypted_with_header(folio, + crypt_stat); + if (err) { printk(KERN_ERR "%s: Error attempting to copy " "the encrypted content from the lower " "file whilst inserting the metadata " - "from the xattr into the header; rc = " - "[%d]\n", __func__, rc); + "from the xattr into the header; err = " + "[%d]\n", __func__, err); goto out; } } else { - rc = ecryptfs_read_lower_page_segment( - page, page->index, 0, PAGE_SIZE, - page->mapping->host); - if (rc) { - printk(KERN_ERR "Error reading page; rc = " - "[%d]\n", rc); + err = ecryptfs_read_lower_page_segment(folio, + folio->index, 0, folio_size(folio), + inode); + if (err) { + printk(KERN_ERR "Error reading page; err = " + "[%d]\n", err); goto out; } } } else { - rc = ecryptfs_decrypt_page(page); - if (rc) { + err = ecryptfs_decrypt_page(folio); + if (err) { ecryptfs_printk(KERN_ERR, "Error decrypting page; " - "rc = [%d]\n", rc); + "err = [%d]\n", err); goto out; } } out: - if (rc) - ClearPageUptodate(page); - else - SetPageUptodate(page); - ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n", - page->index); - unlock_page(page); - return rc; + ecryptfs_printk(KERN_DEBUG, "Unlocking folio with index = [0x%.16lx]\n", + folio->index); + folio_end_read(folio, err == 0); + return err; } /* @@ -285,7 +262,7 @@ static int ecryptfs_write_begin(struct file *file, if (!(crypt_stat->flags & 
ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_read_lower_page_segment( - &folio->page, index, 0, PAGE_SIZE, mapping->host); + folio, index, 0, PAGE_SIZE, mapping->host); if (rc) { printk(KERN_ERR "%s: Error attempting to read " "lower page segment; rc = [%d]\n", @@ -297,7 +274,7 @@ static int ecryptfs_write_begin(struct file *file, } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { rc = ecryptfs_copy_up_encrypted_with_header( - &folio->page, crypt_stat); + folio, crypt_stat); if (rc) { printk(KERN_ERR "%s: Error attempting " "to copy the encrypted content " @@ -311,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file, folio_mark_uptodate(folio); } else { rc = ecryptfs_read_lower_page_segment( - &folio->page, index, 0, PAGE_SIZE, + folio, index, 0, PAGE_SIZE, mapping->host); if (rc) { printk(KERN_ERR "%s: Error reading " @@ -328,7 +305,7 @@ static int ecryptfs_write_begin(struct file *file, folio_zero_range(folio, 0, PAGE_SIZE); folio_mark_uptodate(folio); } else if (len < PAGE_SIZE) { - rc = ecryptfs_decrypt_page(&folio->page); + rc = ecryptfs_decrypt_page(folio); if (rc) { printk(KERN_ERR "%s: Error decrypting " "page at index [%ld]; " @@ -477,7 +454,7 @@ static int ecryptfs_write_end(struct file *file, "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, - &folio->page, 0, to); + folio, 0, to); if (!rc) { rc = copied; fsstack_copy_inode_size(ecryptfs_inode, @@ -499,7 +476,7 @@ static int ecryptfs_write_end(struct file *file, "zeros in page with index = [0x%.16lx]\n", index); goto out; } - rc = ecryptfs_encrypt_page(&folio->page); + rc = ecryptfs_encrypt_page(folio); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " "index [0x%.16lx])\n", index); @@ -548,9 +525,10 @@ const struct address_space_operations ecryptfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio 
= block_invalidate_folio, #endif - .writepage = ecryptfs_writepage, + .writepages = ecryptfs_writepages, .read_folio = ecryptfs_read_folio, .write_begin = ecryptfs_write_begin, .write_end = ecryptfs_write_end, + .migrate_folio = filemap_migrate_folio, .bmap = ecryptfs_bmap, }; diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 3458f153a588..b3b451c2b941 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -41,30 +41,29 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, /** * ecryptfs_write_lower_page_segment * @ecryptfs_inode: The eCryptfs inode - * @page_for_lower: The page containing the data to be written to the + * @folio_for_lower: The folio containing the data to be written to the * lower file - * @offset_in_page: The offset in the @page_for_lower from which to + * @offset_in_page: The offset in the @folio_for_lower from which to * start writing the data - * @size: The amount of data from @page_for_lower to write to the + * @size: The amount of data from @folio_for_lower to write to the * lower file * * Determines the byte offset in the file for the given page and * offset within the page, maps the page, and makes the call to write - * the contents of @page_for_lower to the lower inode. + * the contents of @folio_for_lower to the lower inode. 
* * Returns zero on success; non-zero otherwise */ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, - struct page *page_for_lower, + struct folio *folio_for_lower, size_t offset_in_page, size_t size) { char *virt; loff_t offset; int rc; - offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT) - + offset_in_page); - virt = kmap_local_page(page_for_lower); + offset = (loff_t)folio_for_lower->index * PAGE_SIZE + offset_in_page; + virt = kmap_local_folio(folio_for_lower, 0); rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); if (rc > 0) rc = 0; @@ -93,7 +92,6 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size) { - struct page *ecryptfs_page; struct ecryptfs_crypt_stat *crypt_stat; char *ecryptfs_page_virt; loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); @@ -111,6 +109,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, else pos = offset; while (pos < (offset + size)) { + struct folio *ecryptfs_folio; pgoff_t ecryptfs_page_idx = (pos >> PAGE_SHIFT); size_t start_offset_in_page = (pos & ~PAGE_MASK); size_t num_bytes = (PAGE_SIZE - start_offset_in_page); @@ -130,17 +129,18 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, if (num_bytes > total_remaining_zeros) num_bytes = total_remaining_zeros; } - ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, - ecryptfs_page_idx); - if (IS_ERR(ecryptfs_page)) { - rc = PTR_ERR(ecryptfs_page); + ecryptfs_folio = read_mapping_folio(ecryptfs_inode->i_mapping, + ecryptfs_page_idx, NULL); + if (IS_ERR(ecryptfs_folio)) { + rc = PTR_ERR(ecryptfs_folio); printk(KERN_ERR "%s: Error getting page at " "index [%ld] from eCryptfs inode " "mapping; rc = [%d]\n", __func__, ecryptfs_page_idx, rc); goto out; } - ecryptfs_page_virt = kmap_local_page(ecryptfs_page); + folio_lock(ecryptfs_folio); + ecryptfs_page_virt = 
kmap_local_folio(ecryptfs_folio, 0); /* * pos: where we're now writing, offset: where the request was @@ -164,17 +164,17 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, data_offset += num_bytes; } kunmap_local(ecryptfs_page_virt); - flush_dcache_page(ecryptfs_page); - SetPageUptodate(ecryptfs_page); - unlock_page(ecryptfs_page); + flush_dcache_folio(ecryptfs_folio); + folio_mark_uptodate(ecryptfs_folio); + folio_unlock(ecryptfs_folio); if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) - rc = ecryptfs_encrypt_page(ecryptfs_page); + rc = ecryptfs_encrypt_page(ecryptfs_folio); else rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, - ecryptfs_page, + ecryptfs_folio, start_offset_in_page, data_offset); - put_page(ecryptfs_page); + folio_put(ecryptfs_folio); if (rc) { printk(KERN_ERR "%s: Error encrypting " "page; rc = [%d]\n", __func__, rc); @@ -228,7 +228,7 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size, /** * ecryptfs_read_lower_page_segment - * @page_for_ecryptfs: The page into which data for eCryptfs will be + * @folio_for_ecryptfs: The folio into which data for eCryptfs will be * written * @page_index: Page index in @page_for_ecryptfs from which to start * writing @@ -243,7 +243,7 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size, * * Returns zero on success; non-zero otherwise */ -int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, +int ecryptfs_read_lower_page_segment(struct folio *folio_for_ecryptfs, pgoff_t page_index, size_t offset_in_page, size_t size, struct inode *ecryptfs_inode) @@ -252,12 +252,12 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, loff_t offset; int rc; - offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page); - virt = kmap_local_page(page_for_ecryptfs); + offset = (loff_t)page_index * PAGE_SIZE + offset_in_page; + virt = kmap_local_folio(folio_for_ecryptfs, 0); rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); if 
(rc > 0) rc = 0; kunmap_local(virt); - flush_dcache_page(page_for_ecryptfs); + flush_dcache_folio(folio_for_ecryptfs); return rc; } diff --git a/fs/efs/super.c b/fs/efs/super.c index e4421c10caeb..c59086b7eabf 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -15,7 +15,6 @@ #include <linux/vfs.h> #include <linux/blkdev.h> #include <linux/fs_context.h> -#include <linux/fs_parser.h> #include "efs.h" #include <linux/efs_vh.h> #include <linux/efs_fs_sb.h> @@ -49,15 +48,6 @@ static struct pt_types sgi_pt_types[] = { {0, NULL} }; -enum { - Opt_explicit_open, -}; - -static const struct fs_parameter_spec efs_param_spec[] = { - fsparam_flag ("explicit-open", Opt_explicit_open), - {} -}; - /* * File system definition and registration. */ @@ -67,7 +57,6 @@ static struct file_system_type efs_fs_type = { .kill_sb = efs_kill_sb, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = efs_init_fs_context, - .parameters = efs_param_spec, }; MODULE_ALIAS_FS("efs"); @@ -265,7 +254,8 @@ static int efs_fill_super(struct super_block *s, struct fs_context *fc) if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { pr_err("device does not support %d byte blocks\n", EFS_BLOCKSIZE); - return -EINVAL; + return invalf(fc, "device does not support %d byte blocks\n", + EFS_BLOCKSIZE); } /* read the vh (volume header) block */ @@ -327,43 +317,22 @@ static int efs_fill_super(struct super_block *s, struct fs_context *fc) return 0; } -static void efs_free_fc(struct fs_context *fc) -{ - kfree(fc->fs_private); -} - static int efs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, efs_fill_super); } -static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param) -{ - int token; - struct fs_parse_result result; - - token = fs_parse(fc, efs_param_spec, param, &result); - if (token < 0) - return token; - return 0; -} - static int efs_reconfigure(struct fs_context *fc) { sync_filesystem(fc->root->d_sb); + fc->sb_flags |= SB_RDONLY; return 0; } -struct efs_context { - unsigned long s_mount_opts; -}; 
- static const struct fs_context_operations efs_context_opts = { - .parse_param = efs_parse_param, .get_tree = efs_get_tree, .reconfigure = efs_reconfigure, - .free = efs_free_fc, }; /* @@ -371,12 +340,6 @@ static const struct fs_context_operations efs_context_opts = { */ static int efs_init_fs_context(struct fs_context *fc) { - struct efs_context *ctx; - - ctx = kzalloc(sizeof(struct efs_context), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - fc->fs_private = ctx; fc->ops = &efs_context_opts; return 0; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 666873f745da..bed3dbe5b7cb 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -191,10 +191,14 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, if (IS_ERR(file)) return PTR_ERR(file); - dif->file = file; - if (!erofs_is_fileio_mode(sbi)) + if (!erofs_is_fileio_mode(sbi)) { dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file), &dif->dax_part_off, NULL, NULL); + } else if (!S_ISREG(file_inode(file)->i_mode)) { + fput(file); + return -EINVAL; + } + dif->file = file; } dif->blocks = le32_to_cpu(dis->blocks); @@ -705,7 +709,9 @@ static int erofs_fc_get_tree(struct fs_context *fc) if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); - ret = get_tree_bdev(fc, erofs_fc_fill_super); + ret = get_tree_bdev_flags(fc, erofs_fc_fill_super, + IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) ? 
+ GET_TREE_BDEV_QUIET_LOOKUP : 0); #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE if (ret == -ENOTBLK) { if (!fc->source) @@ -714,7 +720,10 @@ static int erofs_fc_get_tree(struct fs_context *fc) if (IS_ERR(sbi->fdev)) return PTR_ERR(sbi->fdev); - return get_tree_nodev(fc, erofs_fc_fill_super); + if (S_ISREG(file_inode(sbi->fdev)->i_mode) && + sbi->fdev->f_mapping->a_ops->read_folio) + return get_tree_nodev(fc, erofs_fc_fill_super); + fput(sbi->fdev); } #endif return ret; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 8936790618c6..a569ff9dfd04 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -710,24 +710,6 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, return ret; } -static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) -{ - struct z_erofs_pcluster *pcl = f->pcl; - z_erofs_next_pcluster_t *owned_head = &f->owned_head; - - /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, - *owned_head) == Z_EROFS_PCLUSTER_NIL) { - *owned_head = &pcl->next; - /* so we can attach this pcluster to our submission chain. */ - f->mode = Z_EROFS_PCLUSTER_FOLLOWED; - return; - } - - /* type 2, it belongs to an ongoing chain */ - f->mode = Z_EROFS_PCLUSTER_INFLIGHT; -} - static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; @@ -803,7 +785,6 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) int ret; DBG_BUGON(fe->pcl); - /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); @@ -823,7 +804,15 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) if (ret == -EEXIST) { mutex_lock(&fe->pcl->lock); - z_erofs_try_to_claim_pcluster(fe); + /* check if this pcluster hasn't been linked into any chain. 
*/ + if (cmpxchg(&fe->pcl->next, Z_EROFS_PCLUSTER_NIL, + fe->owned_head) == Z_EROFS_PCLUSTER_NIL) { + /* .. so it can be attached to our submission chain */ + fe->owned_head = &fe->pcl->next; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; + } else { /* otherwise, it belongs to an inflight chain */ + fe->mode = Z_EROFS_PCLUSTER_INFLIGHT; + } } else if (ret) { return ret; } diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 1253a8456e59..a076cca1f547 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -10,8 +10,6 @@ struct z_erofs_maprecorder { struct inode *inode; struct erofs_map_blocks *map; - void *kaddr; - unsigned long lcn; /* compression extent information gathered */ u8 type, headtype; @@ -33,14 +31,11 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, struct z_erofs_lcluster_index *di; unsigned int advise; - m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - pos, EROFS_KMAP); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); - - m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); + di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP); + if (IS_ERR(di)) + return PTR_ERR(di); m->lcn = lcn; - di = m->kaddr; + m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); advise = le16_to_cpu(di->di_advise); m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK; @@ -53,8 +48,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, DBG_BUGON(1); return -EFSCORRUPTED; } - m->compressedblks = m->delta[0] & - ~Z_EROFS_LI_D0_CBLKCNT; + m->compressedblks = m->delta[0] & ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; } m->delta[1] = le16_to_cpu(di->di_u.delta[1]); @@ -110,9 +104,9 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; - int i; - u8 *in, type; bool big_pcluster; + u8 *in, type; + int i; if (1 << amortizedshift == 4 && 
lclusterbits <= 14) vcnt = 2; @@ -121,6 +115,10 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, else return -EOPNOTSUPP; + in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP); + if (IS_ERR(in)) + return PTR_ERR(in); + /* it doesn't equal to round_up(..) */ m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); @@ -128,9 +126,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; bytes = pos & ((vcnt << amortizedshift) - 1); - - in = m->kaddr - bytes; - + in -= bytes; i = bytes >> amortizedshift; lo = decode_compactedbits(lobits, in, encodebits * i, &type); @@ -255,10 +251,6 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, amortizedshift = 2; out: pos += lcn * (1 << amortizedshift); - m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - pos, EROFS_KMAP); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); } diff --git a/fs/eventfd.c b/fs/eventfd.c index 22c934f3a080..76129bfcd663 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -347,13 +347,10 @@ EXPORT_SYMBOL_GPL(eventfd_fget); */ struct eventfd_ctx *eventfd_ctx_fdget(int fd) { - struct eventfd_ctx *ctx; - struct fd f = fdget(fd); - if (!fd_file(f)) + CLASS(fd, f)(fd); + if (fd_empty(f)) return ERR_PTR(-EBADF); - ctx = eventfd_ctx_fileget(fd_file(f)); - fdput(f); - return ctx; + return eventfd_ctx_fileget(fd_file(f)); } EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1ae4542f0bd8..62433cb3d2c2 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -823,7 +823,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { - file->f_ep = NULL; + /* See 
eventpoll_release() for details. */ + WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); @@ -1002,7 +1003,7 @@ static struct file *epi_fget(const struct epitem *epi) struct file *file; file = epi->ffd.file; - if (!atomic_long_inc_not_zero(&file->f_count)) + if (!file_ref_get(&file->f_ref)) file = NULL; return file; } @@ -1372,7 +1373,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v break; } } - wake_up(&ep->wq); + if (sync) + wake_up_sync(&ep->wq); + else + wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; @@ -1603,7 +1607,8 @@ allocate: spin_unlock(&file->f_lock); goto allocate; } - file->f_ep = head; + /* See eventpoll_release() for details. */ + WRITE_ONCE(file->f_ep, head); to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); @@ -2254,25 +2259,22 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, { int error; int full_check = 0; - struct fd f, tf; struct eventpoll *ep; struct epitem *epi; struct eventpoll *tep = NULL; - error = -EBADF; - f = fdget(epfd); - if (!fd_file(f)) - goto error_return; + CLASS(fd, f)(epfd); + if (fd_empty(f)) + return -EBADF; /* Get the "struct file *" for the target file */ - tf = fdget(fd); - if (!fd_file(tf)) - goto error_fput; + CLASS(fd, tf)(fd); + if (fd_empty(tf)) + return -EBADF; /* The target file descriptor must support poll */ - error = -EPERM; if (!file_can_poll(fd_file(tf))) - goto error_tgt_fput; + return -EPERM; /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) @@ -2391,12 +2393,6 @@ error_tgt_fput: loop_check_gen++; mutex_unlock(&epnested_mutex); } - - fdput(tf); -error_fput: - fdput(f); -error_return: - return error; } @@ -2424,8 +2420,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { - int error; - struct fd f; struct 
eventpoll *ep; /* The maximum number of event must be greater than zero */ @@ -2437,17 +2431,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, return -EFAULT; /* Get the "struct file *" for the eventpoll file */ - f = fdget(epfd); - if (!fd_file(f)) + CLASS(fd, f)(epfd); + if (fd_empty(f)) return -EBADF; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ - error = -EINVAL; if (!is_file_epoll(fd_file(f))) - goto error_fput; + return -EINVAL; /* * At this point it is safe to assume that the "private_data" contains @@ -2456,11 +2449,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, ep = fd_file(f)->private_data; /* Time to fish for events ... */ - error = ep_poll(ep, events, maxevents, to); - -error_fput: - fdput(f); - return error; + return ep_poll(ep, events, maxevents, to); } SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 591fb3f710be..8042ad873808 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -550,7 +550,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO | (ignore_locked ? 
REQ_RAHEAD : 0), - ext4_end_bitmap_read); + ext4_end_bitmap_read, + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO)); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); @@ -577,7 +578,6 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, if (!desc) return -EFSCORRUPTED; wait_on_buffer(bh); - ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO); if (!buffer_uptodate(bh)) { ext4_error_err(sb, EIO, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ef6a3c8f3a9a..02d47a64e8d1 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -418,7 +418,7 @@ struct fname { __u32 inode; __u8 name_len; __u8 file_type; - char name[]; + char name[] __counted_by(name_len); }; /* @@ -471,14 +471,13 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, struct rb_node **p, *parent = NULL; struct fname *fname, *new_fn; struct dir_private_info *info; - int len; info = dir_file->private_data; p = &info->root.rb_node; /* Create and allocate the fname structure */ - len = sizeof(struct fname) + ent_name->len + 1; - new_fn = kzalloc(len, GFP_KERNEL); + new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1), + GFP_KERNEL); if (!new_fn) return -ENOMEM; new_fn->hash = hash; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 44b0d418143c..74f2071189b2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1729,6 +1729,10 @@ struct ext4_sb_info { */ struct work_struct s_sb_upd_work; + /* Atomic write unit values in bytes */ + unsigned int s_awu_min; + unsigned int s_awu_max; + /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; @@ -1865,14 +1869,6 @@ static inline bool ext4_simulate_fail(struct super_block *sb, return false; } -static inline void ext4_simulate_fail_bh(struct super_block *sb, - struct buffer_head *bh, - unsigned long code) -{ - if (!IS_ERR(bh) && ext4_simulate_fail(sb, code)) - clear_buffer_uptodate(bh); -} - /* * Error number codes for 
s_{first,last}_error_errno * @@ -3100,9 +3096,9 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block); extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io); + bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io); + bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait); extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); @@ -3855,6 +3851,12 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) return buffer_uptodate(bh); } +static inline bool ext4_inode_can_atomic_write(struct inode *inode) +{ + + return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0; +} + extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 34e25eee6521..a07a98a4b97a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -568,7 +568,7 @@ __read_extent_tree_block(const char *function, unsigned int line, if (!bh_uptodate_or_lock(bh)) { trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); - err = ext4_read_bh(bh, 0, NULL); + err = ext4_read_bh(bh, 0, NULL, false); if (err < 0) goto errout; } @@ -3138,7 +3138,7 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) return; ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, - EXTENT_STATUS_WRITTEN, 0); + EXTENT_STATUS_WRITTEN, false); } /* FIXME!! 
we need to try to merge to left or right after zero-out */ @@ -4158,7 +4158,7 @@ insert_hole: /* Put just found gap into cache to speed up subsequent requests */ ext_debug(inode, " -> %u:%u\n", hole_start, len); ext4_es_insert_extent(inode, hole_start, len, ~0, - EXTENT_STATUS_HOLE, 0); + EXTENT_STATUS_HOLE, false); /* Update hole_len to reflect hole size after lblk */ if (hole_start != lblk) @@ -4482,7 +4482,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, int depth = 0; struct ext4_map_blocks map; unsigned int credits; - loff_t epos; + loff_t epos, old_size = i_size_read(inode); BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; @@ -4541,6 +4541,11 @@ retry: if (ext4_update_inode_size(inode, epos) & 0x1) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); + if (epos > old_size) { + pagecache_isize_extended(inode, old_size, epos); + ext4_zero_partial_blocks(handle, inode, + old_size, epos - old_size); + } } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c786691dabd3..ae29832aab1e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -848,7 +848,7 @@ out: */ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status, int flags) + unsigned int status, bool delalloc_reserve_used) { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; @@ -863,8 +863,8 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("add [%u/%u) %llu %x %x to extent status tree of inode %lu\n", - lblk, len, pblk, status, flags, inode->i_ino); + es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n", + lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino); if (!len) return; @@ -945,7 +945,7 @@ error: resv_used += 
pending; if (resv_used) ext4_da_update_reserve_space(inode, resv_used, - flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); + delalloc_reserve_used); if (err1 || err2 || err3 < 0) goto retry; diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 4424232de298..8f9c008d11e8 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -135,7 +135,8 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status, int flags); + unsigned int status, + bool delalloc_reserve_used); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index b33664f6ce2a..26c4fc37edcf 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -291,9 +291,9 @@ void ext4_fc_del(struct inode *inode) return; restart: - spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); + spin_lock(&sbi->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); + spin_unlock(&sbi->s_fc_lock); return; } @@ -357,9 +357,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl } spin_lock(&sbi->s_fc_lock); is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - if (has_transaction && - (!is_ineligible || - (is_ineligible && tid_gt(tid, sbi->s_fc_ineligible_tid)))) + if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); spin_unlock(&sbi->s_fc_lock); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f14aed14b9cf..3bd96c3d4cd0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -392,8 +392,9 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, */ if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && pos + 
size <= i_size_read(inode)) - return size; - return ext4_handle_inode_extension(inode, pos, size, size); + return 0; + error = ext4_handle_inode_extension(inode, pos, size, size); + return error < 0 ? error : 0; } static const struct iomap_dio_ops ext4_dio_write_ops = { @@ -564,12 +565,9 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) } ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - ext4_journal_stop(handle); + if (ret) + goto out; } if (ilock_shared && !unwritten) @@ -599,6 +597,13 @@ out: ssize_t err; loff_t endbyte; + /* + * There is no support for atomic writes on buffered-io yet, + * we should never fallback to buffered-io for DIO atomic + * writes. + */ + WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC); + offset = iocb->ki_pos; err = ext4_buffered_write_iter(iocb, from); if (err < 0) @@ -692,6 +697,20 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif + + if (iocb->ki_flags & IOCB_ATOMIC) { + size_t len = iov_iter_count(from); + int ret; + + if (len < EXT4_SB(inode->i_sb)->s_awu_min || + len > EXT4_SB(inode->i_sb)->s_awu_max) + return -EINVAL; + + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); else @@ -884,6 +903,9 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } + if (ext4_inode_can_atomic_write(inode)) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index df853c4d3a8c..383c6edea6dd 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -185,6 +185,56 @@ static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr) return fmr->fmr_physical + fmr->fmr_length; } +static int ext4_getfsmap_meta_helper(struct super_block *sb, + 
ext4_group_t agno, ext4_grpblk_t start, + ext4_grpblk_t len, void *priv) +{ + struct ext4_getfsmap_info *info = priv; + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb, fs_start, fs_end; + int error; + + fs_start = fsb = (EXT4_C2B(sbi, start) + + ext4_group_first_block_no(sb, agno)); + fs_end = fs_start + EXT4_C2B(sbi, len); + + /* Return relevant extents from the meta_list */ + list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { + if (p->fmr_physical < info->gfi_next_fsblk) { + list_del(&p->fmr_list); + kfree(p); + continue; + } + if (p->fmr_physical <= fs_start || + p->fmr_physical + p->fmr_length <= fs_end) { + /* Emit the retained free extent record if present */ + if (info->gfi_lastfree.fmr_owner) { + error = ext4_getfsmap_helper(sb, info, + &info->gfi_lastfree); + if (error) + return error; + info->gfi_lastfree.fmr_owner = 0; + } + error = ext4_getfsmap_helper(sb, info, p); + if (error) + return error; + fsb = p->fmr_physical + p->fmr_length; + if (info->gfi_next_fsblk < fsb) + info->gfi_next_fsblk = fsb; + list_del(&p->fmr_list); + kfree(p); + continue; + } + } + if (info->gfi_next_fsblk < fsb) + info->gfi_next_fsblk = fsb; + + return 0; +} + + /* Transform a blockgroup's free record into a fsmap */ static int ext4_getfsmap_datadev_helper(struct super_block *sb, ext4_group_t agno, ext4_grpblk_t start, @@ -539,6 +589,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb, error = ext4_mballoc_query_range(sb, info->gfi_agno, EXT4_B2C(sbi, info->gfi_low.fmr_physical), EXT4_B2C(sbi, info->gfi_high.fmr_physical), + ext4_getfsmap_meta_helper, ext4_getfsmap_datadev_helper, info); if (error) goto err; @@ -560,7 +611,8 @@ static int ext4_getfsmap_datadev(struct super_block *sb, /* Report any gaps at the end of the bg */ info->gfi_last = true; - error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info); + error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1, + 
0, info); if (error) goto err; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 7f1a5f90dbbd..21d228073d79 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -193,8 +193,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) * submit the buffer_head for reading */ trace_ext4_load_inode_bitmap(sb, block_group); - ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read); - ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO); + ext4_read_bh(bh, REQ_META | REQ_PRIO, + ext4_end_bitmap_read, + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_EIO)); if (!buffer_uptodate(bh)) { put_bh(bh); ext4_error_err(sb, EIO, "Cannot read inode bitmap - " diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 7404f0935c90..7de327fa7b1c 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -170,7 +170,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, } if (!bh_uptodate_or_lock(bh)) { - if (ext4_read_bh(bh, 0, NULL) < 0) { + if (ext4_read_bh(bh, 0, NULL, false) < 0) { put_bh(bh); goto failure; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 54bdd4884fe6..89aade6f45f6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -483,7 +483,7 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, 0); + map->m_pblk, status, false); return retval; } @@ -563,8 +563,8 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, flags); + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, + status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); return retval; } @@ -856,7 +856,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, if (nowait) return sb_find_get_block(inode->i_sb, map.m_pblk); - bh = sb_getblk(inode->i_sb, map.m_pblk); + /* + * Since bh could introduce extra ref count such as referred by + * journal_head etc. Try to avoid using __GFP_MOVABLE here + * as it may fail the migration when journal_head remains. + */ + bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk, + inode->i_sb->s_blocksize); + if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { @@ -1307,8 +1314,10 @@ static int ext4_write_end(struct file *file, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. 
Second, it forces lock @@ -1423,8 +1432,10 @@ static int ext4_journalled_write_end(struct file *file, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); @@ -2985,7 +2996,8 @@ static int ext4_da_do_write_end(struct address_space *mapping, struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; - loff_t new_i_size; + loff_t new_i_size, zero_len = 0; + handle_t *handle; if (unlikely(!folio_buffers(folio))) { folio_unlock(folio); @@ -3029,18 +3041,21 @@ static int ext4_da_do_write_end(struct address_space *mapping, folio_unlock(folio); folio_put(folio); - if (old_size < pos) + if (pos > old_size) { pagecache_isize_extended(inode, old_size, pos); + zero_len = pos - old_size; + } - if (disksize_changed) { - handle_t *handle; + if (!disksize_changed && !zero_len) + return copied; - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (zero_len) + ext4_zero_partial_blocks(handle, inode, old_size, zero_len); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); return copied; } @@ -3444,17 +3459,34 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, return ret; } +static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) +{ + /* must be a directio to fall back to buffered */ + if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != + (IOMAP_WRITE | IOMAP_DIRECT)) + return false; + + /* atomic writes are all-or-nothing */ + if (flags & IOMAP_ATOMIC) + return false; + + /* can only try again if we wrote nothing */ + 
return written == 0; +} + static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { /* * Check to see whether an error occurred while writing out the data to - * the allocated blocks. If so, return the magic error code so that we - * fallback to buffered I/O and attempt to complete the remainder of - * the I/O. Any blocks that may have been allocated in preparation for - * the direct I/O will be reused during buffered I/O. + * the allocated blocks. If so, return the magic error code for + * non-atomic write so that we fallback to buffered I/O and attempt to + * complete the remainder of the I/O. + * For non-atomic writes, any blocks that may have been + * allocated in preparation for the direct I/O will be reused during + * buffered I/O. For atomic write, we never fallback to buffered-io. */ - if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) + if (ext4_want_directio_fallback(flags, written)) return -ENOTBLK; return 0; @@ -4497,10 +4529,10 @@ make_io: * Read the block from disk. 
*/ trace_ext4_load_inode(sb, ino); - ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); + ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL, + ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)); blk_finish_plug(&plug); wait_on_buffer(bh); - ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; @@ -5426,6 +5458,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (attr->ia_size != inode->i_size) { + /* attach jbd2 jinode for EOF folio tail zeroing */ + if (attr->ia_size & (inode->i_sb->s_blocksize - 1) || + oldsize & (inode->i_sb->s_blocksize - 1)) { + error = ext4_inode_attach_jinode(inode); + if (error) + goto err_out; + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); @@ -5436,12 +5476,17 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, orphan = 1; } /* - * Update c/mtime on truncate up, ext4_truncate() will - * update c/mtime in shrink case below + * Update c/mtime and tail zero the EOF folio on + * truncate up. ext4_truncate() handles the shrink case + * below. 
*/ - if (!shrink) + if (!shrink) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + if (oldsize & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, + inode->i_mapping, oldsize); + } if (shrink) ext4_fc_track_range(handle, inode, @@ -5578,6 +5623,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, } } + if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int awu_min = 0, awu_max = 0; + + if (ext4_inode_can_atomic_write(inode)) { + awu_min = sbi->s_awu_min; + awu_max = sbi->s_awu_max; + } + + generic_fill_statx_atomic_writes(stat, awu_min, awu_max); + } + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 1c77400bd88e..7b9ce71c1c81 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1330,7 +1330,6 @@ group_extend_out: case EXT4_IOC_MOVE_EXT: { struct move_extent me; - struct fd donor; int err; if (!(filp->f_mode & FMODE_READ) || @@ -1342,30 +1341,26 @@ group_extend_out: return -EFAULT; me.moved_len = 0; - donor = fdget(me.donor_fd); - if (!fd_file(donor)) + CLASS(fd, donor)(me.donor_fd); + if (fd_empty(donor)) return -EBADF; - if (!(fd_file(donor)->f_mode & FMODE_WRITE)) { - err = -EBADF; - goto mext_out; - } + if (!(fd_file(donor)->f_mode & FMODE_WRITE)) + return -EBADF; if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with bigalloc"); - err = -EOPNOTSUPP; - goto mext_out; + return -EOPNOTSUPP; } else if (IS_DAX(inode)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with DAX"); - err = -EOPNOTSUPP; - goto mext_out; + return -EOPNOTSUPP; } err = mnt_want_write_file(filp); if (err) - goto mext_out; + return err; err = ext4_move_extents(filp, fd_file(donor), me.orig_start, me.donor_start, me.len, &me.moved_len); @@ -1374,8 +1369,6 @@ group_extend_out: if (copy_to_user((struct 
move_extent __user *)arg, &me, sizeof(me))) err = -EFAULT; -mext_out: - fdput(donor); return err; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d73e38323879..b25a27c86696 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5711,7 +5711,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); mb_debug(sb, "%u found", ac->ac_found); - mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no"); + mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa)); if (ac->ac_pa) mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? "group pa" : "inode pa"); @@ -6056,7 +6056,7 @@ static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, } out_dbg: - mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no"); + mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret)); return ret; } @@ -6999,13 +6999,14 @@ int ext4_mballoc_query_range( struct super_block *sb, ext4_group_t group, - ext4_grpblk_t start, + ext4_grpblk_t first, ext4_grpblk_t end, + ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv) { void *bitmap; - ext4_grpblk_t next; + ext4_grpblk_t start, next; struct ext4_buddy e4b; int error; @@ -7016,10 +7017,19 @@ ext4_mballoc_query_range( ext4_lock_group(sb, group); - start = max(e4b.bd_info->bb_first_free, start); + start = max(e4b.bd_info->bb_first_free, first); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; - + if (meta_formatter && start != first) { + if (start > end) + start = end; + ext4_unlock_group(sb, group); + error = meta_formatter(sb, group, first, start - first, + priv); + if (error) + goto out_unload; + ext4_lock_group(sb, group); + } while (start <= end) { start = mb_find_next_zero_bit(bitmap, end + 1, start); if (start > end) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index d8553f1498d3..f8280de3e882 100644 --- a/fs/ext4/mballoc.h +++ 
b/fs/ext4/mballoc.h @@ -259,6 +259,7 @@ ext4_mballoc_query_range( ext4_group_t agno, ext4_grpblk_t start, ext4_grpblk_t end, + ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index bd946d0c71b7..d64c04ed061a 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -94,7 +94,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, } lock_buffer(*bh); - ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL); + ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL, false); if (ret) goto warn_exit; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index b64661ea6e0e..898443e98efc 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -213,7 +213,7 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) unlock_buffer(bh); continue; } - ext4_read_bh_nowait(bh, 0, NULL); + ext4_read_bh_nowait(bh, 0, NULL, false); nr++; } while (block++, (bh = bh->b_this_page) != head); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 790db7eac6c2..bcf2737078b8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1747,7 +1747,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, #endif frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) - return (struct buffer_head *) frame; + return ERR_CAST(frame); do { block = dx_get_block(frame->at); bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); @@ -1952,7 +1952,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; - return (struct ext4_dir_entry_2 *) bh2; + return ERR_CAST(bh2); } BUFFER_TRACE(*bh, "get_write_access"); @@ -2000,8 +2000,17 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, else split = count/2; + if (WARN_ON_ONCE(split == 0)) { + /* Should never happen, but avoid out-of-bounds access below */ + ext4_error_inode_block(dir, (*bh)->b_blocknr, 0, + "bad 
indexed directory? hash=%08x:%08x count=%d move=%u", + hinfo->hash, hinfo->minor_hash, count, move); + err = -EFSCORRUPTED; + goto out; + } + hash2 = map[split].hash; - continued = split > 0 ? hash2 == map[split - 1].hash : 0; + continued = hash2 == map[split - 1].hash; dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", (unsigned long)dx_get_block(frame->at), hash2, split, count-split)); @@ -2043,10 +2052,11 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, return de; journal_error: + ext4_std_error(dir->i_sb, err); +out: brelse(*bh); brelse(bh2); *bh = NULL; - ext4_std_error(dir->i_sb, err); return ERR_PTR(err); } @@ -2395,11 +2405,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; -#if IS_ENABLED(CONFIG_UNICODE) - if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && - utf8_validate(sb->s_encoding, &dentry->d_name)) + if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; -#endif retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); if (retval) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ad5543866d21..69b8a7221a2b 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -417,11 +417,13 @@ static void io_submit_add_bh(struct ext4_io_submit *io, submit_and_retry: ext4_io_submit(io); } - if (io->io_bio == NULL) + if (io->io_bio == NULL) { io_submit_init_bio(io, bh); + io->io_bio->bi_write_hint = inode->i_write_hint; + } if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; - wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size); + wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size); io->io_next_block++; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a2704f064361..72f77f78ae8d 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1300,7 +1300,7 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) if (unlikely(!bh)) 
return NULL; if (!bh_uptodate_or_lock(bh)) { - if (ext4_read_bh(bh, 0, NULL) < 0) { + if (ext4_read_bh(bh, 0, NULL, false) < 0) { brelse(bh); return NULL; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 16a4ce704460..785809f33ff4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -161,8 +161,14 @@ MODULE_ALIAS("ext3"); static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io) + bh_end_io_t *end_io, bool simu_fail) { + if (simu_fail) { + clear_buffer_uptodate(bh); + unlock_buffer(bh); + return; + } + /* * buffer's verified bit is no longer valid after reading from * disk again due to write out error, clear it to make sure we @@ -176,7 +182,7 @@ static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, } void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io) + bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); @@ -184,10 +190,11 @@ void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, unlock_buffer(bh); return; } - __ext4_read_bh(bh, op_flags, end_io); + __ext4_read_bh(bh, op_flags, end_io, simu_fail); } -int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) +int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, + bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); @@ -196,7 +203,7 @@ int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io return 0; } - __ext4_read_bh(bh, op_flags, end_io); + __ext4_read_bh(bh, op_flags, end_io, simu_fail); wait_on_buffer(bh); if (buffer_uptodate(bh)) @@ -208,10 +215,10 @@ int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { lock_buffer(bh); if (!wait) { - ext4_read_bh_nowait(bh, op_flags, NULL); + ext4_read_bh_nowait(bh, op_flags, NULL, false); return 0; } - return ext4_read_bh(bh, op_flags, NULL); + return ext4_read_bh(bh, op_flags, NULL, false); } /* @@ -266,7 +273,7 @@ void 
ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) if (likely(bh)) { if (trylock_buffer(bh)) - ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL); + ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL, false); brelse(bh); } } @@ -346,9 +353,9 @@ __u32 ext4_free_group_clusters(struct super_block *sb, __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg) { - return le16_to_cpu(bg->bg_free_inodes_count_lo) | + return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); + (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0); } __u32 ext4_used_dirs_count(struct super_block *sb, @@ -402,9 +409,9 @@ void ext4_free_group_clusters_set(struct super_block *sb, void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { - bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); + WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count)); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) - bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); + WRITE_ONCE(bg->bg_free_inodes_count_hi, cpu_to_le16(count >> 16)); } void ext4_used_dirs_set(struct super_block *sb, @@ -2096,16 +2103,16 @@ static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, } #define EXT4_SET_CTX(name) \ -static inline void ctx_set_##name(struct ext4_fs_context *ctx, \ - unsigned long flag) \ +static inline __maybe_unused \ +void ctx_set_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name |= flag; \ } #define EXT4_CLEAR_CTX(name) \ -static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \ - unsigned long flag) \ +static inline __maybe_unused \ +void ctx_clear_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name &= ~flag; \ @@ -3030,6 +3037,9 @@ static int 
_ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PUTS("mb_optimize_scan=1"); } + if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) + SEQ_OPTS_PUTS("prefetch_block_bitmaps"); + ext4_show_quota_options(seq, sb); return 0; } @@ -3709,12 +3719,12 @@ static int ext4_run_li_request(struct ext4_li_request *elr) ret = 1; if (!ret) { - start_time = ktime_get_real_ns(); + start_time = ktime_get_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { - elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) * + elr->lr_timeout = nsecs_to_jiffies((ktime_get_ns() - start_time) * EXT4_SB(elr->lr_super)->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; @@ -3774,8 +3784,9 @@ static int ext4_lazyinit_thread(void *arg) cont_thread: while (true) { - next_wakeup = MAX_JIFFY_OFFSET; + bool next_wakeup_initialized = false; + next_wakeup = 0; mutex_lock(&eli->li_list_mtx); if (list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); @@ -3788,8 +3799,11 @@ cont_thread: lr_request); if (time_before(jiffies, elr->lr_next_sched)) { - if (time_before(elr->lr_next_sched, next_wakeup)) + if (!next_wakeup_initialized || + time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; + next_wakeup_initialized = true; + } continue; } if (down_read_trylock(&elr->lr_super->s_umount)) { @@ -3817,16 +3831,18 @@ cont_thread: elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } - if (time_before(elr->lr_next_sched, next_wakeup)) + if (!next_wakeup_initialized || + time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; + next_wakeup_initialized = true; + } } mutex_unlock(&eli->li_list_mtx); try_to_freeze(); cur = jiffies; - if ((time_after_eq(cur, next_wakeup)) || - (MAX_JIFFY_OFFSET == next_wakeup)) { + if (!next_wakeup_initialized || time_after_eq(cur, 
next_wakeup)) { cond_resched(); continue; } @@ -4425,6 +4441,36 @@ static int ext4_handle_clustersize(struct super_block *sb) return 0; } +/* + * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. + * @sb: super block + * TODO: Later add support for bigalloc + */ +static void ext4_atomic_write_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct block_device *bdev = sb->s_bdev; + + if (!bdev_can_atomic_write(bdev)) + return; + + if (!ext4_has_feature_extents(sb)) + return; + + sbi->s_awu_min = max(sb->s_blocksize, + bdev_atomic_write_unit_min_bytes(bdev)); + sbi->s_awu_max = min(sb->s_blocksize, + bdev_atomic_write_unit_max_bytes(bdev)); + if (sbi->s_awu_min && sbi->s_awu_max && + sbi->s_awu_min <= sbi->s_awu_max) { + ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u", + sbi->s_awu_min, sbi->s_awu_max); + } else { + sbi->s_awu_min = 0; + sbi->s_awu_max = 0; + } +} + static void ext4_fast_commit_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -5336,6 +5382,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) spin_lock_init(&sbi->s_bdev_wb_lock); + ext4_atomic_write_init(sb); ext4_fast_commit_init(sb); sb->s_root = NULL; @@ -6301,7 +6348,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) struct ext4_sb_info *sbi = EXT4_SB(sb); if (unlikely(ext4_forced_shutdown(sb))) - return 0; + return -EIO; trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); @@ -6518,8 +6565,12 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) goto restore_opts; } - if (test_opt2(sb, ABORT)) - ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); + if ((old_opts.s_mount_opt & EXT4_MOUNT_DELALLOC) && + !test_opt(sb, DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't disable delalloc during remount"); + err = -EINVAL; + goto restore_opts; + } sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, 
POSIX_ACL) ? SB_POSIXACL : 0); @@ -6689,6 +6740,14 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); + /* + * Handle aborting the filesystem as the last thing during remount to + * avoid obscure errors during remount when some option changes fail to + * apply due to shutdown filesystem. + */ + if (test_opt2(sb, ABORT)) + ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); + return 0; restore_opts: @@ -7329,7 +7388,7 @@ static struct file_system_type ext4_fs_type = { .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("ext4"); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 94f7b084f601..e3ce763cce18 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -711,7 +711,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) } if (fio->io_wbc && !is_read_io(fio->op)) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? 
__read_io_type(page) : WB_DATA_TYPE(fio->page, false)); @@ -911,7 +912,8 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); @@ -1011,7 +1013,8 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9ae54c4c72fe..84447d5145aa 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3038,32 +3038,27 @@ out: static int __f2fs_ioc_move_range(struct file *filp, struct f2fs_move_range *range) { - struct fd dst; int err; if (!(filp->f_mode & FMODE_READ) || !(filp->f_mode & FMODE_WRITE)) return -EBADF; - dst = fdget(range->dst_fd); - if (!fd_file(dst)) + CLASS(fd, dst)(range->dst_fd); + if (fd_empty(dst)) return -EBADF; - if (!(fd_file(dst)->f_mode & FMODE_WRITE)) { - err = -EBADF; - goto err_out; - } + if (!(fd_file(dst)->f_mode & FMODE_WRITE)) + return -EBADF; err = mnt_want_write_file(filp); if (err) - goto err_out; + return err; err = f2fs_move_file_range(filp, range->pos_in, fd_file(dst), range->pos_out, range->len); mnt_drop_write_file(filp); -err_out: - fdput(dst); return err; } @@ -4647,7 +4642,8 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) iov_iter_count(to), READ); /* In LFS mode, if there is inflight dio, wait for its completion */ - if (f2fs_lfs_mode(F2FS_I_SB(inode))) + if (f2fs_lfs_mode(F2FS_I_SB(inode)) && + get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE)) inode_dio_wait(inode); if (f2fs_should_use_dio(inode, iocb, to)) { diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 6423e1dedf14..15bf32c21ac0 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1037,7 +1037,7 @@ error_inode: if (corrupt < 0) { fat_fs_error(new_dir->i_sb, "%s: 
Filesystem corrupted (i_pos %lld)", - __func__, sinfo.i_pos); + __func__, new_i_pos); } goto out; } diff --git a/fs/fcntl.c b/fs/fcntl.c index 22dd9dcce7ec..ac77dd912412 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -12,7 +12,6 @@ #include <linux/fs.h> #include <linux/filelock.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/capability.h> #include <linux/dnotify.h> #include <linux/slab.h> @@ -397,6 +396,9 @@ static long f_dupfd_query(int fd, struct file *filp) { CLASS(fd_raw, f)(fd); + if (fd_empty(f)) + return -EBADF; + /* * We can do the 'fdput()' immediately, as the only thing that * matters is the pointer value which isn't changed by the fdput. @@ -570,24 +572,21 @@ static int check_fcntl_cmd(unsigned cmd) SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { - struct fd f = fdget_raw(fd); - long err = -EBADF; + CLASS(fd_raw, f)(fd); + long err; - if (!fd_file(f)) - goto out; + if (fd_empty(f)) + return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) - goto out1; + return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (!err) err = do_fcntl(fd, cmd, arg, fd_file(f)); -out1: - fdput(f); -out: return err; } @@ -596,21 +595,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { void __user *argp = (void __user *)arg; - struct fd f = fdget_raw(fd); + CLASS(fd_raw, f)(fd); struct flock64 flock; - long err = -EBADF; + long err; - if (!fd_file(f)) - goto out; + if (fd_empty(f)) + return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) - goto out1; + return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (err) - goto out1; + return err; switch (cmd) { case F_GETLK64: @@ -635,9 +634,6 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } -out1: - fdput(f); -out: return err; } #endif @@ -733,21 +729,21 @@ static int 
fixup_compat_flock(struct flock *flock) static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, compat_ulong_t arg) { - struct fd f = fdget_raw(fd); + CLASS(fd_raw, f)(fd); struct flock flock; - long err = -EBADF; + long err; - if (!fd_file(f)) - return err; + if (fd_empty(f)) + return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) - goto out_put; + return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (err) - goto out_put; + return err; switch (cmd) { case F_GETLK: @@ -790,8 +786,6 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } -out_put: - fdput(f); return err; } diff --git a/fs/fhandle.c b/fs/fhandle.c index 82df28d45cd7..5f801139358e 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -139,12 +139,11 @@ static int get_path_from_fd(int fd, struct path *root) path_get(root); spin_unlock(&fs->lock); } else { - struct fd f = fdget(fd); - if (!fd_file(f)) + CLASS(fd, f)(fd); + if (fd_empty(f)) return -EBADF; *root = fd_file(f)->f_path; path_get(root); - fdput(f); } return 0; diff --git a/fs/file.c b/fs/file.c index eb093e736972..fb1011cf6b4a 100644 --- a/fs/file.c +++ b/fs/file.c @@ -20,10 +20,73 @@ #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> +#include <linux/file_ref.h> #include <net/sock.h> #include "internal.h" +/** + * __file_ref_put - Slowpath of file_ref_put() + * @ref: Pointer to the reference count + * @cnt: Current reference count + * + * Invoked when the reference count is outside of the valid zone. + * + * Return: + * True if this was the last reference with no future references + * possible. This signals the caller that it can safely schedule the + * object, which is protected by the reference counter, for + * deconstruction. + * + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. 
Caller is not allowed to + * deconstruct the protected object. + */ +bool __file_ref_put(file_ref_t *ref, unsigned long cnt) +{ + /* Did this drop the last reference? */ + if (likely(cnt == FILE_REF_NOREF)) { + /* + * Carefully try to set the reference count to FILE_REF_DEAD. + * + * This can fail if a concurrent get() operation has + * elevated it again or the corresponding put() even marked + * it dead already. Both are valid situations and do not + * require a retry. If this fails the caller is not + * allowed to deconstruct the object. + */ + if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD)) + return false; + + /* + * The caller can safely schedule the object for + * deconstruction. Provide acquire ordering. + */ + smp_acquire__after_ctrl_dep(); + return true; + } + + /* + * If the reference count was already in the dead zone, then this + * put() operation is imbalanced. Warn, put the reference count back to + * DEAD and tell the caller to not deconstruct the object. + */ + if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { + atomic_long_set(&ref->refcnt, FILE_REF_DEAD); + return false; + } + + /* + * This is a put() operation on a saturated refcount. Restore the + * mean saturation value and tell the caller to not deconstruct the + * object. + */ + if (cnt > FILE_REF_MAXREF) + atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); + return false; +} +EXPORT_SYMBOL_GPL(__file_ref_put); + unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ @@ -89,18 +152,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". 
- * - * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied - * by that "1024/sizeof(ptr)" before, we already know there are sufficient - * clear low bits. Clang seems to realize that, gcc ends up being confused. - * - * On a 128-bit machine, the ALIGN() would actually matter. In the meantime, - * let's consider it documentation (and maybe a test-case for gcc to improve - * its code generation ;) */ -static struct fdtable * alloc_fdtable(unsigned int nr) +static struct fdtable *alloc_fdtable(unsigned int slots_wanted) { struct fdtable *fdt; + unsigned int nr; void *data; /* @@ -108,22 +164,32 @@ static struct fdtable * alloc_fdtable(unsigned int nr) * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B - * and growing in powers of two from there on. + * and growing in powers of two from there on. Since we are called only + * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab + * already gives BITS_PER_LONG slots), the above boils down to + * 1. use the smallest power of two large enough to give us that many + * slots. + * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is + * 256 slots (i.e. 1Kb fd array). + * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there + * and we are never going to be asked for 64 or less. */ - nr /= (1024 / sizeof(struct file *)); - nr = roundup_pow_of_two(nr + 1); - nr *= (1024 / sizeof(struct file *)); - nr = ALIGN(nr, BITS_PER_LONG); + if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) + nr = 256; + else + nr = roundup_pow_of_two(slots_wanted); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open - * had been set lower between the check in expand_files() and here. Deal - * with that in caller, it's cheaper that way. + * had been set lower between the check in expand_files() and here. 
* * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ - if (unlikely(nr > sysctl_nr_open)) - nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; + if (unlikely(nr > sysctl_nr_open)) { + nr = round_down(sysctl_nr_open, BITS_PER_LONG); + if (nr < slots_wanted) + return ERR_PTR(-EMFILE); + } fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) @@ -152,14 +218,14 @@ out_arr: out_fdt: kfree(fdt); out: - return NULL; + return ERR_PTR(-ENOMEM); } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. - * Return <0 error code on error; 1 on successful completion. + * Return <0 error code on error; 0 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) @@ -169,7 +235,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); - new_fdt = alloc_fdtable(nr); + new_fdt = alloc_fdtable(nr + 1); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. @@ -178,16 +244,8 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) synchronize_rcu(); spin_lock(&files->file_lock); - if (!new_fdt) - return -ENOMEM; - /* - * extremely unlikely race - sysctl_nr_open decreased between the check in - * caller and alloc_fdtable(). Cheaper to catch it here... 
- */ - if (unlikely(new_fdt->max_fds <= nr)) { - __free_fdtable(new_fdt); - return -EMFILE; - } + if (IS_ERR(new_fdt)) + return PTR_ERR(new_fdt); cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); @@ -196,15 +254,14 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); - return 1; + return 0; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. - * Return <0 error code on error; 0 when nothing done; 1 when files were - * expanded and execution may have blocked. + * Return <0 error code on error; 0 on success. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) @@ -212,14 +269,14 @@ static int expand_files(struct files_struct *files, unsigned int nr) __acquires(files->file_lock) { struct fdtable *fdt; - int expanded = 0; + int error; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) - return expanded; + return 0; /* Can we expand? 
*/ if (nr >= sysctl_nr_open) @@ -227,7 +284,6 @@ repeat: if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); - expanded = 1; wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; @@ -235,27 +291,28 @@ repeat: /* All good, so we try */ files->resize_in_progress = true; - expanded = expand_fdtable(files, nr); + error = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); - return expanded; -} - -static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) -{ - __set_bit(fd, fdt->close_on_exec); + return error; } -static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt) +static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, + bool set) { - if (test_bit(fd, fdt->close_on_exec)) - __clear_bit(fd, fdt->close_on_exec); + if (set) { + __set_bit(fd, fdt->close_on_exec); + } else { + if (test_bit(fd, fdt->close_on_exec)) + __clear_bit(fd, fdt->close_on_exec); + } } -static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) +static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) { __set_bit(fd, fdt->open_fds); + __set_close_on_exec(fd, fdt, set); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); @@ -264,7 +321,9 @@ static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); - __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); + fd /= BITS_PER_LONG; + if (test_bit(fd, fdt->full_fds_bits)) + __clear_bit(fd, fdt->full_fds_bits); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) @@ -306,7 +365,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; - int error; newf = 
kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) @@ -338,17 +396,10 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); - new_fdt = alloc_fdtable(open_files - 1); - if (!new_fdt) { - error = -ENOMEM; - goto out_release; - } - - /* beyond sysctl_nr_open; nothing to do */ - if (unlikely(new_fdt->max_fds < open_files)) { - __free_fdtable(new_fdt); - error = -EMFILE; - goto out_release; + new_fdt = alloc_fdtable(open_files); + if (IS_ERR(new_fdt)) { + kmem_cache_free(files_cachep, newf); + return ERR_CAST(new_fdt); } /* @@ -389,10 +440,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho rcu_assign_pointer(newf->fdt, new_fdt); return newf; - -out_release: - kmem_cache_free(files_cachep, newf); - return ERR_PTR(error); } static struct fdtable *close_files(struct files_struct * files) @@ -413,7 +460,7 @@ static struct fdtable *close_files(struct files_struct * files) set = fdt->open_fds[j++]; while (set) { if (set & 1) { - struct file * file = xchg(&fdt->fd[i], NULL); + struct file *file = fdt->fd[i]; if (file) { filp_close(file, files); cond_resched(); @@ -470,6 +517,15 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; + unsigned int bit; + + /* + * Try to avoid looking at the second level bitmap + */ + bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, + start & (BITS_PER_LONG - 1)); + if (bit < BITS_PER_LONG) + return bit + bitbit * BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit >= maxfd) @@ -496,7 +552,7 @@ repeat: if (fd < files->next_fd) fd = files->next_fd; - if (fd < fdt->max_fds) + if (likely(fd < fdt->max_fds)) fd = find_next_fd(fdt, fd); /* @@ -504,36 +560,22 @@ repeat: * will 
limit the total number of files that can be opened. */ error = -EMFILE; - if (fd >= end) + if (unlikely(fd >= end)) goto out; - error = expand_files(files, fd); - if (error < 0) - goto out; + if (unlikely(fd >= fdt->max_fds)) { + error = expand_files(files, fd); + if (error < 0) + goto out; - /* - * If we needed to expand the fs array we - * might have blocked - try again. - */ - if (error) goto repeat; + } if (start <= files->next_fd) files->next_fd = fd + 1; - __set_open_fd(fd, fdt); - if (flags & O_CLOEXEC) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; -#if 1 - /* Sanity check */ - if (rcu_access_pointer(fdt->fd[fd]) != NULL) { - printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); - rcu_assign_pointer(fdt->fd[fd], NULL); - } -#endif out: spin_unlock(&files->file_lock); @@ -599,7 +641,7 @@ void fd_install(unsigned int fd, struct file *file) rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); - BUG_ON(fdt->fd[fd] != NULL); + WARN_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; @@ -713,7 +755,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd, } /** - * __close_range() - Close all file descriptors in a given range. + * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close @@ -721,8 +763,10 @@ static inline void __range_close(struct files_struct *files, unsigned int fd, * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. + * Currently, errors to close a given file descriptor are ignored. 
*/ -int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) +SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, + unsigned int, flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; @@ -839,7 +883,7 @@ static struct file *__get_file_rcu(struct file __rcu **f) if (!file) return NULL; - if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) + if (unlikely(!file_ref_get(&file->f_ref))) return ERR_PTR(-EAGAIN); file_reloaded = rcu_dereference_raw(*f); @@ -853,8 +897,8 @@ static struct file *__get_file_rcu(struct file __rcu **f) OPTIMIZER_HIDE_VAR(file_reloaded_cmp); /* - * atomic_long_inc_not_zero() above provided a full memory - * barrier when we acquired a reference. + * file_ref_get() above provided a full memory barrier when we + * acquired a reference. * * This is paired with the write barrier from assigning to the * __rcu protected file pointer so that if that pointer still @@ -952,11 +996,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * We need to confirm it by incrementing the refcount * and then check the lookup again. * - * atomic_long_inc_not_zero() gives us a full memory - * barrier. We only really need an 'acquire' one to - * protect the loads below, but we don't have that. + * file_ref_get() gives us a full memory barrier. We + * only really need an 'acquire' one to protect the + * loads below, but we don't have that. 
*/ - if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) + if (unlikely(!file_ref_get(&file->f_ref))) continue; /* @@ -1037,29 +1081,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd) return file; } -struct file *lookup_fdget_rcu(unsigned int fd) -{ - return __fget_files_rcu(current->files, fd, 0); - -} -EXPORT_SYMBOL_GPL(lookup_fdget_rcu); - -struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd) -{ - /* Must be called with rcu_read_lock held */ - struct files_struct *files; - struct file *file = NULL; - - task_lock(task); - files = task->files; - if (files) - file = __fget_files_rcu(files, fd, 0); - task_unlock(task); - - return file; -} - -struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd) +struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; @@ -1069,17 +1091,19 @@ struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int * task_lock(task); files = task->files; if (files) { + rcu_read_lock(); for (; fd < files_fdtable(files)->max_fds; fd++) { file = __fget_files_rcu(files, fd, 0); if (file) break; } + rcu_read_unlock(); } task_unlock(task); *ret_fd = fd; return file; } -EXPORT_SYMBOL(task_lookup_next_fdget_rcu); +EXPORT_SYMBOL(fget_task_next); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. @@ -1096,6 +1120,13 @@ EXPORT_SYMBOL(task_lookup_next_fdget_rcu); * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. + * + * (As an exception to rule 2, you can call filp_close between fget_light and + * fput_light provided that you capture a real refcount with get_file before + * the call to filp_close, and ensure that this real refcount is fput *after* + * the fput_light call.) + * + * See also the documentation in rust/kernel/file.rs. 
*/ static inline struct fd __fget_light(unsigned int fd, fmode_t mask) { @@ -1176,13 +1207,8 @@ void __f_unlock_pos(struct file *f) void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; - struct fdtable *fdt; spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (flag) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_close_on_exec(fd, files_fdtable(files), flag); spin_unlock(&files->file_lock); } @@ -1223,11 +1249,7 @@ __releases(&files->file_lock) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); - __set_open_fd(fd, fdt); - if (flags & O_CLOEXEC) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_open_fd(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) diff --git a/fs/file_table.c b/fs/file_table.c index eed5ffad9997..976736be47cb 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -9,7 +9,6 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> @@ -40,13 +39,17 @@ static struct files_stat_struct files_stat = { /* SLAB cache for file structures */ static struct kmem_cache *filp_cachep __ro_after_init; +static struct kmem_cache *bfilp_cachep __ro_after_init; static struct percpu_counter nr_files __cacheline_aligned_in_smp; /* Container for backing file with optional user path */ struct backing_file { struct file file; - struct path user_path; + union { + struct path user_path; + freeptr_t bf_freeptr; + }; }; static inline struct backing_file *backing_file(struct file *f) @@ -68,7 +71,7 @@ static inline void file_free(struct file *f) put_cred(f->f_cred); if (unlikely(f->f_mode & FMODE_BACKING)) { path_put(backing_file_user_path(f)); - kfree(backing_file(f)); + kmem_cache_free(bfilp_cachep, backing_file(f)); } else { kmem_cache_free(filp_cachep, f); } @@ -165,16 +168,32 @@ static int 
init_file(struct file *f, int flags, const struct cred *cred) * the respective member when opening the file. */ mutex_init(&f->f_pos_lock); - f->f_flags = flags; - f->f_mode = OPEN_FMODE(flags); - /* f->f_version: 0 */ + memset(&f->f_path, 0, sizeof(f->f_path)); + memset(&f->f_ra, 0, sizeof(f->f_ra)); + + f->f_flags = flags; + f->f_mode = OPEN_FMODE(flags); + + f->f_op = NULL; + f->f_mapping = NULL; + f->private_data = NULL; + f->f_inode = NULL; + f->f_owner = NULL; +#ifdef CONFIG_EPOLL + f->f_ep = NULL; +#endif + + f->f_iocb_flags = 0; + f->f_pos = 0; + f->f_wb_err = 0; + f->f_sb_err = 0; /* * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While * fget-rcu pattern users need to be able to handle spurious * refcount bumps we should reinitialize the reused file first. */ - atomic_long_set(&f->f_count, 1); + file_ref_init(&f->f_ref, 1); return 0; } @@ -206,7 +225,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) goto over; } - f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); @@ -240,7 +259,7 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) struct file *f; int error; - f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); @@ -267,13 +286,13 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) struct backing_file *ff; int error; - ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL); + ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL); if (unlikely(!ff)) return ERR_PTR(-ENOMEM); error = init_file(&ff->file, flags, cred); if (unlikely(error)) { - kfree(ff); + kmem_cache_free(bfilp_cachep, ff); return ERR_PTR(error); } @@ -479,7 +498,7 @@ static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); void fput(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) { + if (file_ref_put(&file->f_ref)) { 
struct task_struct *task = current; if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { @@ -512,7 +531,7 @@ void fput(struct file *file) */ void __fput_sync(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) + if (file_ref_put(&file->f_ref)) __fput(file); } @@ -529,6 +548,11 @@ void __init files_init(void) filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); + + args.freeptr_offset = offsetof(struct backing_file, bf_freeptr); + bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file), + &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h index fbcd603365ad..8c67627f2a3d 100644 --- a/fs/freevxfs/vxfs_dir.h +++ b/fs/freevxfs/vxfs_dir.h @@ -25,7 +25,7 @@ struct vxfs_dirblk { __fs16 d_free; /* free space in dirblock */ __fs16 d_nhash; /* no of hash chains */ - __fs16 d_hash[1]; /* hash chain */ + __fs16 d_hash[]; /* hash chain */ }; /* diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d8bec3c1bb1f..3cd99e2dc6ac 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -290,7 +290,6 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio) if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } -EXPORT_SYMBOL_GPL(__inode_attach_wb); /** * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list @@ -731,8 +730,9 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * writeback completion, wbc_detach_inode() should be called. This is used * to track the cgroup writeback context. 
*/ -void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) +static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) { if (!inode_cgwb_enabled(inode)) { spin_unlock(&inode->i_lock); @@ -762,7 +762,24 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } -EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); + +/** + * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite + * @wbc: writeback_control of interest + * @inode: target inode + * + * This function is to be used by __filemap_fdatawrite_range(), which is an + * alternative entry point into writeback code, and first ensures @inode is + * associated with a bdi_writeback and attaches it to @wbc. + */ +void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + inode_attach_wb(inode, NULL); + wbc_attach_and_unlock_inode(wbc, inode); +} +EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode); /** * wbc_detach_inode - disassociate wbc from inode and perform foreign detection @@ -890,17 +907,16 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode); /** * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership * @wbc: writeback_control of the writeback in progress - * @page: page being written out + * @folio: folio being written out * @bytes: number of bytes being written out * - * @bytes from @page are about to written out during the writeback + * @bytes from @folio are about to written out during the writeback * controlled by @wbc. Keep the book for foreign inode detection. See * wbc_detach_inode(). 
*/ -void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, +void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes) { - struct folio *folio; struct cgroup_subsys_state *css; int id; @@ -913,7 +929,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, if (!wbc->wb || wbc->no_cgroup_owner) return; - folio = page_folio(page); css = mem_cgroup_css_from_folio(folio); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!(css->flags & CSS_ONLINE)) @@ -1227,6 +1242,13 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, } } +static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) +{ + spin_unlock(&inode->i_lock); +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 24727ec34e5a..16fa61ef56bf 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -156,6 +156,7 @@ int fs_lookup_param(struct fs_context *fc, f = getname_kernel(param->string); if (IS_ERR(f)) return PTR_ERR(f); + param->dirfd = AT_FDCWD; put_f = true; break; case fs_value_is_filename: @@ -308,6 +309,26 @@ int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p, } EXPORT_SYMBOL(fs_param_is_fd); +int fs_param_is_file_or_string(struct p_log *log, + const struct fs_parameter_spec *p, + struct fs_parameter *param, + struct fs_parse_result *result) +{ + switch (param->type) { + case fs_value_is_string: + return fs_param_is_string(log, p, param, result); + case fs_value_is_file: + result->uint_32 = param->dirfd; + if (result->uint_32 <= INT_MAX) + return 0; + break; + default: + break; + } + return fs_param_bad_value(log, param); +} +EXPORT_SYMBOL(fs_param_is_file_or_string); + int fs_param_is_uid(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { diff --git a/fs/fsopen.c b/fs/fsopen.c index 
6cef3deccded..094a7f510edf 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -349,7 +349,6 @@ SYSCALL_DEFINE5(fsconfig, int, aux) { struct fs_context *fc; - struct fd f; int ret; int lookup_flags = 0; @@ -392,12 +391,11 @@ SYSCALL_DEFINE5(fsconfig, return -EOPNOTSUPP; } - f = fdget(fd); - if (!fd_file(f)) + CLASS(fd, f)(fd); + if (fd_empty(f)) return -EBADF; - ret = -EINVAL; if (fd_file(f)->f_op != &fscontext_fops) - goto out_f; + return -EINVAL; fc = fd_file(f)->private_data; if (fc->ops == &legacy_fs_context_ops) { @@ -407,17 +405,14 @@ SYSCALL_DEFINE5(fsconfig, case FSCONFIG_SET_PATH_EMPTY: case FSCONFIG_SET_FD: case FSCONFIG_CMD_CREATE_EXCL: - ret = -EOPNOTSUPP; - goto out_f; + return -EOPNOTSUPP; } } if (_key) { param.key = strndup_user(_key, 256); - if (IS_ERR(param.key)) { - ret = PTR_ERR(param.key); - goto out_f; - } + if (IS_ERR(param.key)) + return PTR_ERR(param.key); } switch (cmd) { @@ -496,7 +491,5 @@ SYSCALL_DEFINE5(fsconfig, } out_key: kfree(param.key); -out_f: - fdput(f); return ret; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1f64ae6d7a69..0723c6344b20 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2371,13 +2371,12 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) int res; int oldfd; struct fuse_dev *fud = NULL; - struct fd f; if (get_user(oldfd, argp)) return -EFAULT; - f = fdget(oldfd); - if (!fd_file(f)) + CLASS(fd, f)(oldfd); + if (fd_empty(f)) return -EINVAL; /* @@ -2394,7 +2393,6 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) mutex_unlock(&fuse_mutex); } - fdput(f); return res; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f33fbce86ae0..dafdf766b1d5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2288,6 +2288,13 @@ static int fuse_writepages_fill(struct folio *folio, struct folio *tmp_folio; int err; + if (!data->ff) { + err = -EIO; + data->ff = fuse_write_file_get(fi); + if (!data->ff) + goto out_unlock; + } + if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, 
data)) { fuse_writepages_send(data); data->wpa = NULL; @@ -2351,13 +2358,13 @@ static int fuse_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_wb_data data; int err; + err = -EIO; if (fuse_is_bad(inode)) - return -EIO; + goto out; if (wbc->sync_mode == WB_SYNC_NONE && fc->num_background >= fc->congestion_threshold) @@ -2365,9 +2372,7 @@ static int fuse_writepages(struct address_space *mapping, data.inode = inode; data.wpa = NULL; - data.ff = fuse_write_file_get(fi); - if (!data.ff) - return -EIO; + data.ff = NULL; err = -ENOMEM; data.orig_pages = kcalloc(fc->max_pages, @@ -2381,10 +2386,11 @@ static int fuse_writepages(struct address_space *mapping, WARN_ON(!data.wpa->ia.ap.num_pages); fuse_writepages_send(&data); } + if (data.ff) + fuse_file_put(data.ff, false); kfree(data.orig_pages); out: - fuse_file_put(data.ff, false); return err; } diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 62aee8289d11..bbac547dfcb3 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -18,11 +18,11 @@ static void fuse_file_accessed(struct file *file) fuse_invalidate_atime(inode); } -static void fuse_file_modified(struct file *file) +static void fuse_passthrough_end_write(struct file *file, loff_t pos, ssize_t ret) { struct inode *inode = file_inode(file); - fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); + fuse_write_update_attr(inode, pos, ret); } ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter) @@ -63,7 +63,7 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct backing_file_ctx ctx = { .cred = ff->cred, .user_file = file, - .end_write = fuse_file_modified, + .end_write = fuse_passthrough_end_write, }; pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__, @@ -110,7 +110,7 @@ ssize_t fuse_passthrough_splice_write(struct 
pipe_inode_info *pipe, struct backing_file_ctx ctx = { .cred = ff->cred, .user_file = out, - .end_write = fuse_file_modified, + .end_write = fuse_passthrough_end_write, }; pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, @@ -234,7 +234,6 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) goto out; backing_sb = file_inode(file)->i_sb; - pr_info("%s: %x:%pD %i\n", __func__, backing_sb->s_dev, file, backing_sb->s_stack_depth); res = -ELOOP; if (backing_sb->s_stack_depth >= fc->max_stack_depth) goto out_fput; diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index d418d8b5367f..3334c394ce9c 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -190,6 +190,5 @@ const struct export_operations gfs2_export_ops = { .fh_to_parent = gfs2_fh_to_parent, .get_name = gfs2_get_name, .get_parent = gfs2_get_parent, - .flags = EXPORT_OP_ASYNC_LOCK, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index f7dd64856c9b..1e73cf87ff88 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1586,6 +1586,7 @@ const struct file_operations gfs2_file_fops = { .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, + .fop_flags = FOP_ASYNC_LOCK, }; const struct file_operations gfs2_dir_fops = { @@ -1598,6 +1599,7 @@ const struct file_operations gfs2_dir_fops = { .lock = gfs2_lock, .flock = gfs2_flock, .llseek = default_llseek, + .fop_flags = FOP_ASYNC_LOCK, }; #endif /* CONFIG_GFS2_FS_LOCKING_DLM */ diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 269c3bc7fced..4701c4aafbf4 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -34,7 +34,6 @@ #include <linux/lockref.h> #include <linux/rhashtable.h> #include <linux/pid_namespace.h> -#include <linux/fdtable.h> #include <linux/file.h> #include "gfs2.h" @@ -2768,25 +2767,18 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) i->file = NULL; } - rcu_read_lock(); for(;; i->fd++) { - struct inode *inode; - - i->file = 
task_lookup_next_fdget_rcu(i->task, &i->fd); + i->file = fget_task_next(i->task, &i->fd); if (!i->file) { i->fd = 0; break; } - inode = file_inode(i->file); - if (inode->i_sb == i->sb) + if (file_inode(i->file)->i_sb == i->sb) break; - rcu_read_unlock(); fput(i->file); - rcu_read_lock(); } - rcu_read_unlock(); return i->file; } diff --git a/fs/hfs/super.c b/fs/hfs/super.c index eeac99765f0d..3bee9b5dba5e 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -15,10 +15,11 @@ #include <linux/module.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/nls.h> -#include <linux/parser.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/vfs.h> @@ -111,21 +112,24 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int hfs_remount(struct super_block *sb, int *flags, char *data) +static int hfs_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; + sync_filesystem(sb); - *flags |= SB_NODIRATIME; - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) + fc->sb_flags |= SB_NODIRATIME; + if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (!(*flags & SB_RDONLY)) { + + if (!(fc->sb_flags & SB_RDONLY)) { if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. 
leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; } } return 0; @@ -180,7 +184,6 @@ static const struct super_operations hfs_super_operations = { .put_super = hfs_put_super, .sync_fs = hfs_sync_fs, .statfs = hfs_statfs, - .remount_fs = hfs_remount, .show_options = hfs_show_options, }; @@ -188,181 +191,112 @@ enum { opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask, opt_part, opt_session, opt_type, opt_creator, opt_quiet, opt_codepage, opt_iocharset, - opt_err }; -static const match_table_t tokens = { - { opt_uid, "uid=%u" }, - { opt_gid, "gid=%u" }, - { opt_umask, "umask=%o" }, - { opt_file_umask, "file_umask=%o" }, - { opt_dir_umask, "dir_umask=%o" }, - { opt_part, "part=%u" }, - { opt_session, "session=%u" }, - { opt_type, "type=%s" }, - { opt_creator, "creator=%s" }, - { opt_quiet, "quiet" }, - { opt_codepage, "codepage=%s" }, - { opt_iocharset, "iocharset=%s" }, - { opt_err, NULL } +static const struct fs_parameter_spec hfs_param_spec[] = { + fsparam_u32 ("uid", opt_uid), + fsparam_u32 ("gid", opt_gid), + fsparam_u32oct ("umask", opt_umask), + fsparam_u32oct ("file_umask", opt_file_umask), + fsparam_u32oct ("dir_umask", opt_dir_umask), + fsparam_u32 ("part", opt_part), + fsparam_u32 ("session", opt_session), + fsparam_string ("type", opt_type), + fsparam_string ("creator", opt_creator), + fsparam_flag ("quiet", opt_quiet), + fsparam_string ("codepage", opt_codepage), + fsparam_string ("iocharset", opt_iocharset), + {} }; -static inline int match_fourchar(substring_t *arg, u32 *result) -{ - if (arg->to - arg->from != 4) - return -EINVAL; - memcpy(result, arg->from, 4); - return 0; -} - /* - * parse_options() + * hfs_parse_param() * - * adapted from linux/fs/msdos/inode.c written 1992,93 by 
Werner Almesberger - * This function is called by hfs_read_super() to parse the mount options. + * This function is called by the vfs to parse the mount options. */ -static int parse_options(char *options, struct hfs_sb_info *hsb) +static int hfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int tmp, token; - - /* initialize the sb with defaults */ - hsb->s_uid = current_uid(); - hsb->s_gid = current_gid(); - hsb->s_file_umask = 0133; - hsb->s_dir_umask = 0022; - hsb->s_type = hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */ - hsb->s_quiet = 0; - hsb->part = -1; - hsb->session = -1; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case opt_uid: - if (match_int(&args[0], &tmp)) { - pr_err("uid requires an argument\n"); - return 0; - } - hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp); - if (!uid_valid(hsb->s_uid)) { - pr_err("invalid uid %d\n", tmp); - return 0; - } - break; - case opt_gid: - if (match_int(&args[0], &tmp)) { - pr_err("gid requires an argument\n"); - return 0; - } - hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp); - if (!gid_valid(hsb->s_gid)) { - pr_err("invalid gid %d\n", tmp); - return 0; - } - break; - case opt_umask: - if (match_octal(&args[0], &tmp)) { - pr_err("umask requires a value\n"); - return 0; - } - hsb->s_file_umask = (umode_t)tmp; - hsb->s_dir_umask = (umode_t)tmp; - break; - case opt_file_umask: - if (match_octal(&args[0], &tmp)) { - pr_err("file_umask requires a value\n"); - return 0; - } - hsb->s_file_umask = (umode_t)tmp; - break; - case opt_dir_umask: - if (match_octal(&args[0], &tmp)) { - pr_err("dir_umask requires a value\n"); - return 0; - } - hsb->s_dir_umask = (umode_t)tmp; - break; - case opt_part: - if (match_int(&args[0], &hsb->part)) { - pr_err("part requires an argument\n"); - return 0; - } - break; - case opt_session: 
- if (match_int(&args[0], &hsb->session)) { - pr_err("session requires an argument\n"); - return 0; - } - break; - case opt_type: - if (match_fourchar(&args[0], &hsb->s_type)) { - pr_err("type requires a 4 character value\n"); - return 0; - } - break; - case opt_creator: - if (match_fourchar(&args[0], &hsb->s_creator)) { - pr_err("creator requires a 4 character value\n"); - return 0; - } - break; - case opt_quiet: - hsb->s_quiet = 1; - break; - case opt_codepage: - if (hsb->nls_disk) { - pr_err("unable to change codepage\n"); - return 0; - } - p = match_strdup(&args[0]); - if (p) - hsb->nls_disk = load_nls(p); - if (!hsb->nls_disk) { - pr_err("unable to load codepage \"%s\"\n", p); - kfree(p); - return 0; - } - kfree(p); - break; - case opt_iocharset: - if (hsb->nls_io) { - pr_err("unable to change iocharset\n"); - return 0; - } - p = match_strdup(&args[0]); - if (p) - hsb->nls_io = load_nls(p); - if (!hsb->nls_io) { - pr_err("unable to load iocharset \"%s\"\n", p); - kfree(p); - return 0; - } - kfree(p); - break; - default: - return 0; - } - } + struct hfs_sb_info *hsb = fc->s_fs_info; + struct fs_parse_result result; + int opt; + + /* hfs does not honor any fs-specific options on remount */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) + return 0; - if (hsb->nls_disk && !hsb->nls_io) { - hsb->nls_io = load_nls_default(); + opt = fs_parse(fc, hfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case opt_uid: + hsb->s_uid = result.uid; + break; + case opt_gid: + hsb->s_gid = result.gid; + break; + case opt_umask: + hsb->s_file_umask = (umode_t)result.uint_32; + hsb->s_dir_umask = (umode_t)result.uint_32; + break; + case opt_file_umask: + hsb->s_file_umask = (umode_t)result.uint_32; + break; + case opt_dir_umask: + hsb->s_dir_umask = (umode_t)result.uint_32; + break; + case opt_part: + hsb->part = result.uint_32; + break; + case opt_session: + hsb->session = result.uint_32; + break; + case opt_type: + if (strlen(param->string) != 4) 
{ + pr_err("type requires a 4 character value\n"); + return -EINVAL; + } + memcpy(&hsb->s_type, param->string, 4); + break; + case opt_creator: + if (strlen(param->string) != 4) { + pr_err("creator requires a 4 character value\n"); + return -EINVAL; + } + memcpy(&hsb->s_creator, param->string, 4); + break; + case opt_quiet: + hsb->s_quiet = 1; + break; + case opt_codepage: + if (hsb->nls_disk) { + pr_err("unable to change codepage\n"); + return -EINVAL; + } + hsb->nls_disk = load_nls(param->string); + if (!hsb->nls_disk) { + pr_err("unable to load codepage \"%s\"\n", + param->string); + return -EINVAL; + } + break; + case opt_iocharset: + if (hsb->nls_io) { + pr_err("unable to change iocharset\n"); + return -EINVAL; + } + hsb->nls_io = load_nls(param->string); if (!hsb->nls_io) { - pr_err("unable to load default iocharset\n"); - return 0; + pr_err("unable to load iocharset \"%s\"\n", + param->string); + return -EINVAL; } + break; + default: + return -EINVAL; } - hsb->s_dir_umask &= 0777; - hsb->s_file_umask &= 0577; - return 1; + return 0; } /* @@ -376,29 +310,25 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) * hfs_btree_init() to get the necessary data about the extents and * catalog B-trees and, finally, reading the root inode into memory. 
*/ -static int hfs_fill_super(struct super_block *sb, void *data, int silent) +static int hfs_fill_super(struct super_block *sb, struct fs_context *fc) { - struct hfs_sb_info *sbi; + struct hfs_sb_info *sbi = HFS_SB(sb); struct hfs_find_data fd; hfs_cat_rec rec; struct inode *root_inode; + int silent = fc->sb_flags & SB_SILENT; int res; - sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; + /* load_nls_default does not fail */ + if (sbi->nls_disk && !sbi->nls_io) + sbi->nls_io = load_nls_default(); + sbi->s_dir_umask &= 0777; + sbi->s_file_umask &= 0577; - sbi->sb = sb; - sb->s_fs_info = sbi; spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->mdb_work, flush_mdb); - res = -EINVAL; - if (!parse_options((char *)data, sbi)) { - pr_err("unable to parse mount options\n"); - goto bail; - } - + sbi->sb = sb; sb->s_op = &hfs_super_operations; sb->s_xattr = hfs_xattr_handlers; sb->s_flags |= SB_NODIRATIME; @@ -451,18 +381,56 @@ bail: return res; } -static struct dentry *hfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hfs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, hfs_fill_super); +} + +static void hfs_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations hfs_context_ops = { + .parse_param = hfs_parse_param, + .get_tree = hfs_get_tree, + .reconfigure = hfs_reconfigure, + .free = hfs_free_fc, +}; + +static int hfs_init_fs_context(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super); + struct hfs_sb_info *hsb; + + hsb = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); + if (!hsb) + return -ENOMEM; + + fc->s_fs_info = hsb; + fc->ops = &hfs_context_ops; + + if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) { + /* initialize options with defaults */ + hsb->s_uid = current_uid(); + hsb->s_gid = current_gid(); + hsb->s_file_umask = 0133; + hsb->s_dir_umask = 0022; + hsb->s_type = 
cpu_to_be32(0x3f3f3f3f); /* == '????' */ + hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */ + hsb->s_quiet = 0; + hsb->part = -1; + hsb->session = -1; + } + + return 0; } static struct file_system_type hfs_fs_type = { .owner = THIS_MODULE, .name = "hfs", - .mount = hfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = hfs_init_fs_context, }; MODULE_ALIAS_FS("hfs"); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 59ce81dca73f..2f089bff0095 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -21,6 +21,7 @@ #include <linux/mutex.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> +#include <linux/fs_context.h> #include "hfsplus_raw.h" #define DBG_BNODE_REFS 0x00000001 @@ -156,6 +157,7 @@ struct hfsplus_sb_info { /* Runtime variables */ u32 blockoffset; + u32 min_io_size; sector_t part_start; sector_t sect_count; int fs_shift; @@ -307,7 +309,7 @@ struct hfsplus_readdir_data { */ static inline unsigned short hfsplus_min_io_size(struct super_block *sb) { - return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev), + return max_t(unsigned short, HFSPLUS_SB(sb)->min_io_size, HFSPLUS_SECTOR_SIZE); } @@ -496,8 +498,7 @@ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); /* options.c */ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts); -int hfsplus_parse_options_remount(char *input, int *force); -int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi); +int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param); int hfsplus_show_options(struct seq_file *seq, struct dentry *root); /* part_tbl.c */ diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index c94a58762ad6..a66a09a56bf7 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -12,7 +12,8 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include 
<linux/fs_parser.h> #include <linux/nls.h> #include <linux/mount.h> #include <linux/seq_file.h> @@ -23,26 +24,23 @@ enum { opt_creator, opt_type, opt_umask, opt_uid, opt_gid, opt_part, opt_session, opt_nls, - opt_nodecompose, opt_decompose, - opt_barrier, opt_nobarrier, - opt_force, opt_err + opt_decompose, opt_barrier, + opt_force, }; -static const match_table_t tokens = { - { opt_creator, "creator=%s" }, - { opt_type, "type=%s" }, - { opt_umask, "umask=%o" }, - { opt_uid, "uid=%u" }, - { opt_gid, "gid=%u" }, - { opt_part, "part=%u" }, - { opt_session, "session=%u" }, - { opt_nls, "nls=%s" }, - { opt_decompose, "decompose" }, - { opt_nodecompose, "nodecompose" }, - { opt_barrier, "barrier" }, - { opt_nobarrier, "nobarrier" }, - { opt_force, "force" }, - { opt_err, NULL } +static const struct fs_parameter_spec hfs_param_spec[] = { + fsparam_string ("creator", opt_creator), + fsparam_string ("type", opt_type), + fsparam_u32oct ("umask", opt_umask), + fsparam_u32 ("uid", opt_uid), + fsparam_u32 ("gid", opt_gid), + fsparam_u32 ("part", opt_part), + fsparam_u32 ("session", opt_session), + fsparam_string ("nls", opt_nls), + fsparam_flag_no ("decompose", opt_decompose), + fsparam_flag_no ("barrier", opt_barrier), + fsparam_flag ("force", opt_force), + {} }; /* Initialize an options object to reasonable defaults */ @@ -60,162 +58,89 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts) opts->session = -1; } -/* convert a "four byte character" to a 32 bit int with error checks */ -static inline int match_fourchar(substring_t *arg, u32 *result) +/* Parse options from mount. 
Returns nonzero errno on failure */ +int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param) { - if (arg->to - arg->from != 4) - return -EINVAL; - memcpy(result, arg->from, 4); - return 0; -} - -int hfsplus_parse_options_remount(char *input, int *force) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int token; - - if (!input) - return 1; - - while ((p = strsep(&input, ",")) != NULL) { - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case opt_force: - *force = 1; - break; - default: - break; + struct hfsplus_sb_info *sbi = fc->s_fs_info; + struct fs_parse_result result; + int opt; + + /* + * Only the force option is examined during remount, all others + * are ignored. + */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && + strncmp(param->key, "force", 5)) + return 0; + + opt = fs_parse(fc, hfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case opt_creator: + if (strlen(param->string) != 4) { + pr_err("creator requires a 4 character value\n"); + return -EINVAL; } - } - - return 1; -} - -/* Parse options from mount. 
Returns 0 on failure */ -/* input is the options passed to mount() as a string */ -int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int tmp, token; - - if (!input) - goto done; - - while ((p = strsep(&input, ",")) != NULL) { - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case opt_creator: - if (match_fourchar(&args[0], &sbi->creator)) { - pr_err("creator requires a 4 character value\n"); - return 0; - } - break; - case opt_type: - if (match_fourchar(&args[0], &sbi->type)) { - pr_err("type requires a 4 character value\n"); - return 0; - } - break; - case opt_umask: - if (match_octal(&args[0], &tmp)) { - pr_err("umask requires a value\n"); - return 0; - } - sbi->umask = (umode_t)tmp; - break; - case opt_uid: - if (match_int(&args[0], &tmp)) { - pr_err("uid requires an argument\n"); - return 0; - } - sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp); - if (!uid_valid(sbi->uid)) { - pr_err("invalid uid specified\n"); - return 0; - } else { - set_bit(HFSPLUS_SB_UID, &sbi->flags); - } - break; - case opt_gid: - if (match_int(&args[0], &tmp)) { - pr_err("gid requires an argument\n"); - return 0; - } - sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp); - if (!gid_valid(sbi->gid)) { - pr_err("invalid gid specified\n"); - return 0; - } else { - set_bit(HFSPLUS_SB_GID, &sbi->flags); - } - break; - case opt_part: - if (match_int(&args[0], &sbi->part)) { - pr_err("part requires an argument\n"); - return 0; - } - break; - case opt_session: - if (match_int(&args[0], &sbi->session)) { - pr_err("session requires an argument\n"); - return 0; - } - break; - case opt_nls: - if (sbi->nls) { - pr_err("unable to change nls mapping\n"); - return 0; - } - p = match_strdup(&args[0]); - if (p) - sbi->nls = load_nls(p); - if (!sbi->nls) { - pr_err("unable to load nls mapping \"%s\"\n", - p); - kfree(p); - return 0; - } - kfree(p); - break; - case opt_decompose: - 
clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); - break; - case opt_nodecompose: + memcpy(&sbi->creator, param->string, 4); + break; + case opt_type: + if (strlen(param->string) != 4) { + pr_err("type requires a 4 character value\n"); + return -EINVAL; + } + memcpy(&sbi->type, param->string, 4); + break; + case opt_umask: + sbi->umask = (umode_t)result.uint_32; + break; + case opt_uid: + sbi->uid = result.uid; + set_bit(HFSPLUS_SB_UID, &sbi->flags); + break; + case opt_gid: + sbi->gid = result.gid; + set_bit(HFSPLUS_SB_GID, &sbi->flags); + break; + case opt_part: + sbi->part = result.uint_32; + break; + case opt_session: + sbi->session = result.uint_32; + break; + case opt_nls: + if (sbi->nls) { + pr_err("unable to change nls mapping\n"); + return -EINVAL; + } + sbi->nls = load_nls(param->string); + if (!sbi->nls) { + pr_err("unable to load nls mapping \"%s\"\n", + param->string); + return -EINVAL; + } + break; + case opt_decompose: + if (result.negated) set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); - break; - case opt_barrier: - clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); - break; - case opt_nobarrier: + else + clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); + break; + case opt_barrier: + if (result.negated) set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); - break; - case opt_force: - set_bit(HFSPLUS_SB_FORCE, &sbi->flags); - break; - default: - return 0; - } - } - -done: - if (!sbi->nls) { - /* try utf8 first, as this is the old default behaviour */ - sbi->nls = load_nls("utf8"); - if (!sbi->nls) - sbi->nls = load_nls_default(); - if (!sbi->nls) - return 0; + else + clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); + break; + case opt_force: + set_bit(HFSPLUS_SB_FORCE, &sbi->flags); + break; + default: + return -EINVAL; } - return 1; + return 0; } int hfsplus_show_options(struct seq_file *seq, struct dentry *root) diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 97920202790f..948b8aaee33e 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -14,6 +14,7 
@@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/fs.h> +#include <linux/fs_context.h> #include <linux/slab.h> #include <linux/vfs.h> #include <linux/nls.h> @@ -332,34 +333,33 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int hfsplus_remount(struct super_block *sb, int *flags, char *data) +static int hfsplus_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; + sync_filesystem(sb); - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) + if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (!(*flags & SB_RDONLY)) { - struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; - int force = 0; - - if (!hfsplus_parse_options_remount(data, &force)) - return -EINVAL; + if (!(fc->sb_flags & SB_RDONLY)) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_vh *vhdr = sbi->s_vhdr; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. 
leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; - } else if (force) { + fc->sb_flags |= SB_RDONLY; + } else if (test_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { pr_warn("filesystem is marked journaled, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; } } return 0; @@ -373,38 +373,33 @@ static const struct super_operations hfsplus_sops = { .put_super = hfsplus_put_super, .sync_fs = hfsplus_sync_fs, .statfs = hfsplus_statfs, - .remount_fs = hfsplus_remount, .show_options = hfsplus_show_options, }; -static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) +static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc) { struct hfsplus_vh *vhdr; - struct hfsplus_sb_info *sbi; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); hfsplus_cat_entry entry; struct hfs_find_data fd; struct inode *root, *inode; struct qstr str; - struct nls_table *nls = NULL; + struct nls_table *nls; u64 last_fs_block, last_fs_page; + int silent = fc->sb_flags & SB_SILENT; int err; - err = -ENOMEM; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - goto out; - - sb->s_fs_info = sbi; mutex_init(&sbi->alloc_mutex); mutex_init(&sbi->vh_mutex); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); - hfsplus_fill_defaults(sbi); err = -EINVAL; - if (!hfsplus_parse_options(data, sbi)) { - pr_err("unable to parse mount options\n"); - goto out_unload_nls; + if (!sbi->nls) { + /* try utf8 first, as this is the old default behaviour */ + sbi->nls = load_nls("utf8"); + if (!sbi->nls) + sbi->nls = load_nls_default(); } /* temporarily use utf8 to correctly find the hidden dir below */ 
@@ -616,7 +611,6 @@ out_unload_nls: unload_nls(sbi->nls); unload_nls(nls); kfree(sbi); -out: return err; } @@ -641,18 +635,46 @@ static void hfsplus_free_inode(struct inode *inode) #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) -static struct dentry *hfsplus_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hfsplus_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, hfsplus_fill_super); +} + +static void hfsplus_free_fc(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super); + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations hfsplus_context_ops = { + .parse_param = hfsplus_parse_param, + .get_tree = hfsplus_get_tree, + .reconfigure = hfsplus_reconfigure, + .free = hfsplus_free_fc, +}; + +static int hfsplus_init_fs_context(struct fs_context *fc) +{ + struct hfsplus_sb_info *sbi; + + sbi = kzalloc(sizeof(struct hfsplus_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) + hfsplus_fill_defaults(sbi); + + fc->s_fs_info = sbi; + fc->ops = &hfsplus_context_ops; + + return 0; } static struct file_system_type hfsplus_fs_type = { .owner = THIS_MODULE, .name = "hfsplus", - .mount = hfsplus_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = hfsplus_init_fs_context, }; MODULE_ALIAS_FS("hfsplus"); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 9592ffcb44e5..74801911bc1c 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -172,6 +172,8 @@ int hfsplus_read_wrapper(struct super_block *sb) if (!blocksize) goto out; + sbi->min_io_size = blocksize; + if (hfsplus_get_last_session(sb, &part_start, &part_size)) goto out; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index e73717daa5f9..27567920abe4 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -9,7 +9,8 @@ #include "hpfs_fn.h" #include <linux/module.h> -#include <linux/parser.h> 
+#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/init.h> #include <linux/statfs.h> #include <linux/magic.h> @@ -90,7 +91,7 @@ void hpfs_error(struct super_block *s, const char *fmt, ...) hpfs_sb(s)->sb_was_error = 1; } -/* +/* * A little trick to detect cycles in many hpfs structures and don't let the * kernel crash on corrupted filesystem. When first called, set c2 to 0. * @@ -272,146 +273,70 @@ static void destroy_inodecache(void) kmem_cache_destroy(hpfs_inode_cachep); } -/* - * A tiny parser for option strings, stolen from dosfs. - * Stolen again from read-only hpfs. - * And updated for table-driven option parsing. - */ - enum { - Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case_lower, Opt_case_asis, - Opt_check_none, Opt_check_normal, Opt_check_strict, - Opt_err_cont, Opt_err_ro, Opt_err_panic, - Opt_eas_no, Opt_eas_ro, Opt_eas_rw, - Opt_chkdsk_no, Opt_chkdsk_errors, Opt_chkdsk_always, - Opt_timeshift, Opt_err, + Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case, + Opt_check, Opt_err, Opt_eas, Opt_chkdsk, Opt_timeshift, }; -static const match_table_t tokens = { - {Opt_help, "help"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_umask, "umask=%o"}, - {Opt_case_lower, "case=lower"}, - {Opt_case_asis, "case=asis"}, - {Opt_check_none, "check=none"}, - {Opt_check_normal, "check=normal"}, - {Opt_check_strict, "check=strict"}, - {Opt_err_cont, "errors=continue"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_err_panic, "errors=panic"}, - {Opt_eas_no, "eas=no"}, - {Opt_eas_ro, "eas=ro"}, - {Opt_eas_rw, "eas=rw"}, - {Opt_chkdsk_no, "chkdsk=no"}, - {Opt_chkdsk_errors, "chkdsk=errors"}, - {Opt_chkdsk_always, "chkdsk=always"}, - {Opt_timeshift, "timeshift=%d"}, - {Opt_err, NULL}, +static const struct constant_table hpfs_param_case[] = { + {"asis", 0}, + {"lower", 1}, + {} }; -static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask, - int *lowercase, int *eas, int *chk, int *errs, - int *chkdsk, int *timeshift) -{ - char *p; 
- int option; +static const struct constant_table hpfs_param_check[] = { + {"none", 0}, + {"normal", 1}, + {"strict", 2}, + {} +}; - if (!opts) - return 1; +static const struct constant_table hpfs_param_err[] = { + {"continue", 0}, + {"remount-ro", 1}, + {"panic", 2}, + {} +}; - /*pr_info("Parsing opts: '%s'\n",opts);*/ - - while ((p = strsep(&opts, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_help: - return 2; - case Opt_uid: - if (match_int(args, &option)) - return 0; - *uid = make_kuid(current_user_ns(), option); - if (!uid_valid(*uid)) - return 0; - break; - case Opt_gid: - if (match_int(args, &option)) - return 0; - *gid = make_kgid(current_user_ns(), option); - if (!gid_valid(*gid)) - return 0; - break; - case Opt_umask: - if (match_octal(args, &option)) - return 0; - *umask = option; - break; - case Opt_case_lower: - *lowercase = 1; - break; - case Opt_case_asis: - *lowercase = 0; - break; - case Opt_check_none: - *chk = 0; - break; - case Opt_check_normal: - *chk = 1; - break; - case Opt_check_strict: - *chk = 2; - break; - case Opt_err_cont: - *errs = 0; - break; - case Opt_err_ro: - *errs = 1; - break; - case Opt_err_panic: - *errs = 2; - break; - case Opt_eas_no: - *eas = 0; - break; - case Opt_eas_ro: - *eas = 1; - break; - case Opt_eas_rw: - *eas = 2; - break; - case Opt_chkdsk_no: - *chkdsk = 0; - break; - case Opt_chkdsk_errors: - *chkdsk = 1; - break; - case Opt_chkdsk_always: - *chkdsk = 2; - break; - case Opt_timeshift: - { - int m = 1; - char *rhs = args[0].from; - if (!rhs || !*rhs) - return 0; - if (*rhs == '-') m = -1; - if (*rhs == '+' || *rhs == '-') rhs++; - *timeshift = simple_strtoul(rhs, &rhs, 0) * m; - if (*rhs) - return 0; - break; - } - default: - return 0; - } - } - return 1; -} +static const struct constant_table hpfs_param_eas[] = { + {"no", 0}, + {"ro", 1}, + {"rw", 2}, + {} +}; + +static const struct constant_table 
hpfs_param_chkdsk[] = { + {"no", 0}, + {"errors", 1}, + {"always", 2}, + {} +}; + +static const struct fs_parameter_spec hpfs_param_spec[] = { + fsparam_flag ("help", Opt_help), + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), + fsparam_u32oct ("umask", Opt_umask), + fsparam_enum ("case", Opt_case, hpfs_param_case), + fsparam_enum ("check", Opt_check, hpfs_param_check), + fsparam_enum ("errors", Opt_err, hpfs_param_err), + fsparam_enum ("eas", Opt_eas, hpfs_param_eas), + fsparam_enum ("chkdsk", Opt_chkdsk, hpfs_param_chkdsk), + fsparam_s32 ("timeshift", Opt_timeshift), + {} +}; + +struct hpfs_fc_context { + kuid_t uid; + kgid_t gid; + umode_t umask; + int lowercase; + int eas; + int chk; + int errs; + int chkdsk; + int timeshift; +}; static inline void hpfs_help(void) { @@ -439,49 +364,92 @@ HPFS filesystem options:\n\ \n"); } -static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) +static int hpfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - kuid_t uid; - kgid_t gid; - umode_t umask; - int lowercase, eas, chk, errs, chkdsk, timeshift; - int o; + struct hpfs_fc_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, hpfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_help: + hpfs_help(); + return -EINVAL; + case Opt_uid: + ctx->uid = result.uid; + break; + case Opt_gid: + ctx->gid = result.gid; + break; + case Opt_umask: + ctx->umask = result.uint_32; + break; + case Opt_case: + ctx->lowercase = result.uint_32; + break; + case Opt_check: + ctx->chk = result.uint_32; + break; + case Opt_err: + ctx->errs = result.uint_32; + break; + case Opt_eas: + ctx->eas = result.uint_32; + break; + case Opt_chkdsk: + ctx->chkdsk = result.uint_32; + break; + case Opt_timeshift: + { + int m = 1; + char *rhs = param->string; + int timeshift; + + if (*rhs == '-') m = -1; + if (*rhs == '+' || *rhs == '-') rhs++; + timeshift = simple_strtoul(rhs, 
&rhs, 0) * m; + if (*rhs) + return -EINVAL; + ctx->timeshift = timeshift; + break; + } + default: + return -EINVAL; + } + + return 0; +} + +static int hpfs_reconfigure(struct fs_context *fc) +{ + struct hpfs_fc_context *ctx = fc->fs_private; + struct super_block *s = fc->root->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); sync_filesystem(s); - *flags |= SB_NOATIME; + fc->sb_flags |= SB_NOATIME; hpfs_lock(s); - uid = sbi->sb_uid; gid = sbi->sb_gid; - umask = 0777 & ~sbi->sb_mode; - lowercase = sbi->sb_lowercase; - eas = sbi->sb_eas; chk = sbi->sb_chk; chkdsk = sbi->sb_chkdsk; - errs = sbi->sb_err; timeshift = sbi->sb_timeshift; - - if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase, - &eas, &chk, &errs, &chkdsk, &timeshift))) { - pr_err("bad mount options.\n"); - goto out_err; - } - if (o == 2) { - hpfs_help(); - goto out_err; - } - if (timeshift != sbi->sb_timeshift) { + + if (ctx->timeshift != sbi->sb_timeshift) { pr_err("timeshift can't be changed using remount.\n"); goto out_err; } unmark_dirty(s); - sbi->sb_uid = uid; sbi->sb_gid = gid; - sbi->sb_mode = 0777 & ~umask; - sbi->sb_lowercase = lowercase; - sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk; - sbi->sb_err = errs; sbi->sb_timeshift = timeshift; + sbi->sb_uid = ctx->uid; sbi->sb_gid = ctx->gid; + sbi->sb_mode = 0777 & ~ctx->umask; + sbi->sb_lowercase = ctx->lowercase; + sbi->sb_eas = ctx->eas; sbi->sb_chk = ctx->chk; + sbi->sb_chkdsk = ctx->chkdsk; + sbi->sb_err = ctx->errs; sbi->sb_timeshift = ctx->timeshift; - if (!(*flags & SB_RDONLY)) mark_dirty(s, 1); + if (!(fc->sb_flags & SB_RDONLY)) mark_dirty(s, 1); hpfs_unlock(s); return 0; @@ -530,30 +498,24 @@ static const struct super_operations hpfs_sops = .evict_inode = hpfs_evict_inode, .put_super = hpfs_put_super, .statfs = hpfs_statfs, - .remount_fs = hpfs_remount_fs, .show_options = hpfs_show_options, }; -static int hpfs_fill_super(struct super_block *s, void *options, int silent) +static int hpfs_fill_super(struct super_block *s, struct 
fs_context *fc) { + struct hpfs_fc_context *ctx = fc->fs_private; struct buffer_head *bh0, *bh1, *bh2; struct hpfs_boot_block *bootblock; struct hpfs_super_block *superblock; struct hpfs_spare_block *spareblock; struct hpfs_sb_info *sbi; struct inode *root; - - kuid_t uid; - kgid_t gid; - umode_t umask; - int lowercase, eas, chk, errs, chkdsk, timeshift; + int silent = fc->sb_flags & SB_SILENT; dnode_secno root_dno; struct hpfs_dirent *de = NULL; struct quad_buffer_head qbh; - int o; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) { return -ENOMEM; @@ -563,26 +525,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) mutex_init(&sbi->hpfs_mutex); hpfs_lock(s); - uid = current_uid(); - gid = current_gid(); - umask = current_umask(); - lowercase = 0; - eas = 2; - chk = 1; - errs = 1; - chkdsk = 1; - timeshift = 0; - - if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase, - &eas, &chk, &errs, &chkdsk, &timeshift))) { - pr_err("bad mount options.\n"); - goto bail0; - } - if (o==2) { - hpfs_help(); - goto bail0; - } - /*sbi->sb_mounting = 1;*/ sb_set_blocksize(s, 512); sbi->sb_fs_size = -1; @@ -622,17 +564,17 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) sbi->sb_dirband_start = le32_to_cpu(superblock->dir_band_start); sbi->sb_dirband_size = le32_to_cpu(superblock->n_dir_band); sbi->sb_dmap = le32_to_cpu(superblock->dir_band_bitmap); - sbi->sb_uid = uid; - sbi->sb_gid = gid; - sbi->sb_mode = 0777 & ~umask; + sbi->sb_uid = ctx->uid; + sbi->sb_gid = ctx->gid; + sbi->sb_mode = 0777 & ~ctx->umask; sbi->sb_n_free = -1; sbi->sb_n_free_dnodes = -1; - sbi->sb_lowercase = lowercase; - sbi->sb_eas = eas; - sbi->sb_chk = chk; - sbi->sb_chkdsk = chkdsk; - sbi->sb_err = errs; - sbi->sb_timeshift = timeshift; + sbi->sb_lowercase = ctx->lowercase; + sbi->sb_eas = ctx->eas; + sbi->sb_chk = ctx->chk; + sbi->sb_chkdsk = ctx->chkdsk; + sbi->sb_err = ctx->errs; + sbi->sb_timeshift = ctx->timeshift; sbi->sb_was_error = 0; 
sbi->sb_cp_table = NULL; sbi->sb_c_bitmap = -1; @@ -653,7 +595,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) /* Check for general fs errors*/ if (spareblock->dirty && !spareblock->old_wrote) { - if (errs == 2) { + if (sbi->sb_err == 2) { pr_err("Improperly stopped, not mounted\n"); goto bail4; } @@ -667,16 +609,16 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) } if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) { - if (errs >= 2) { + if (sbi->sb_err >= 2) { pr_err("Spare dnodes used, try chkdsk\n"); mark_dirty(s, 0); goto bail4; } hpfs_error(s, "warning: spare dnodes used, try chkdsk"); - if (errs == 0) + if (sbi->sb_err == 0) pr_err("Proceeding, but your filesystem could be corrupted if you delete files or directories\n"); } - if (chk) { + if (sbi->sb_chk) { unsigned a; if (le32_to_cpu(superblock->dir_band_end) - le32_to_cpu(superblock->dir_band_start) + 1 != le32_to_cpu(superblock->n_dir_band) || le32_to_cpu(superblock->dir_band_end) < le32_to_cpu(superblock->dir_band_start) || le32_to_cpu(superblock->n_dir_band) > 0x4000) { @@ -755,18 +697,70 @@ bail0: return -EINVAL; } -static struct dentry *hpfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hpfs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, hpfs_fill_super); +} + +static void hpfs_free_fc(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super); + kfree(fc->fs_private); } +static const struct fs_context_operations hpfs_fc_context_ops = { + .parse_param = hpfs_parse_param, + .get_tree = hpfs_get_tree, + .reconfigure = hpfs_reconfigure, + .free = hpfs_free_fc, +}; + +static int hpfs_init_fs_context(struct fs_context *fc) +{ + struct hpfs_fc_context *ctx; + + ctx = kzalloc(sizeof(struct hpfs_fc_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { 
+ struct super_block *sb = fc->root->d_sb; + struct hpfs_sb_info *sbi = hpfs_sb(sb); + + ctx->uid = sbi->sb_uid; + ctx->gid = sbi->sb_gid; + ctx->umask = 0777 & ~sbi->sb_mode; + ctx->lowercase = sbi->sb_lowercase; + ctx->eas = sbi->sb_eas; + ctx->chk = sbi->sb_chk; + ctx->chkdsk = sbi->sb_chkdsk; + ctx->errs = sbi->sb_err; + ctx->timeshift = sbi->sb_timeshift; + + } else { + ctx->uid = current_uid(); + ctx->gid = current_gid(); + ctx->umask = current_umask(); + ctx->lowercase = 0; + ctx->eas = 2; + ctx->chk = 1; + ctx->errs = 1; + ctx->chkdsk = 1; + ctx->timeshift = 0; + } + + fc->fs_private = ctx; + fc->ops = &hpfs_fc_context_ops; + + return 0; +}; + static struct file_system_type hpfs_fs_type = { .owner = THIS_MODULE, .name = "hpfs", - .mount = hpfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = hpfs_init_fs_context, + .parameters = hpfs_param_spec, }; MODULE_ALIAS_FS("hpfs"); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5cf327337e22..1bbf783b244a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -39,6 +39,9 @@ #include <linux/uaccess.h> #include <linux/sched/mm.h> +#define CREATE_TRACE_POINTS +#include <trace/events/hugetlbfs.h> + static const struct address_space_operations hugetlbfs_aops; static const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; @@ -110,7 +113,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). 
*/ - vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); + vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND | VM_MTE_ALLOWED); vma->vm_ops = &hugetlb_vm_ops; ret = seal_check_write(info->seals, vma); @@ -687,6 +690,7 @@ static void hugetlbfs_evict_inode(struct inode *inode) { struct resv_map *resv_map; + trace_hugetlbfs_evict_inode(inode); remove_inode_hugepages(inode, 0, LLONG_MAX); /* @@ -814,8 +818,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; - if (mode & FALLOC_FL_PUNCH_HOLE) - return hugetlbfs_punch_hole(inode, offset, len); + if (mode & FALLOC_FL_PUNCH_HOLE) { + error = hugetlbfs_punch_hole(inode, offset, len); + goto out_nolock; + } /* * Default preallocate case. @@ -919,6 +925,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, inode_set_ctime_current(inode); out: inode_unlock(inode); + +out_nolock: + trace_hugetlbfs_fallocate(inode, mode, offset, len, error); return error; } @@ -935,6 +944,8 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap, if (error) return error; + trace_hugetlbfs_setattr(inode, dentry, attr); + if (ia_valid & ATTR_SIZE) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; @@ -1033,6 +1044,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); + trace_hugetlbfs_alloc_inode(inode, dir, mode); } else { if (resv_map) kref_put(&resv_map->refs, resv_map_release); @@ -1272,6 +1284,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) static void hugetlbfs_free_inode(struct inode *inode) { + trace_hugetlbfs_free_inode(inode); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } diff --git a/fs/inode.c b/fs/inode.c index 8dabb224f941..b13b778257ae 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -21,7 +21,12 @@ #include <linux/list_lru.h> #include <linux/iversion.h> #include <linux/rw_hint.h> +#include 
<linux/seq_file.h> +#include <linux/debugfs.h> #include <trace/events/writeback.h> +#define CREATE_TRACE_POINTS +#include <trace/events/timestamp.h> + #include "internal.h" /* @@ -98,6 +103,70 @@ long get_nr_dirty_inodes(void) return nr_dirty > 0 ? nr_dirty : 0; } +#ifdef CONFIG_DEBUG_FS +static DEFINE_PER_CPU(long, mg_ctime_updates); +static DEFINE_PER_CPU(long, mg_fine_stamps); +static DEFINE_PER_CPU(long, mg_ctime_swaps); + +static unsigned long get_mg_ctime_updates(void) +{ + unsigned long sum = 0; + int i; + + for_each_possible_cpu(i) + sum += data_race(per_cpu(mg_ctime_updates, i)); + return sum; +} + +static unsigned long get_mg_fine_stamps(void) +{ + unsigned long sum = 0; + int i; + + for_each_possible_cpu(i) + sum += data_race(per_cpu(mg_fine_stamps, i)); + return sum; +} + +static unsigned long get_mg_ctime_swaps(void) +{ + unsigned long sum = 0; + int i; + + for_each_possible_cpu(i) + sum += data_race(per_cpu(mg_ctime_swaps, i)); + return sum; +} + +#define mgtime_counter_inc(__var) this_cpu_inc(__var) + +static int mgts_show(struct seq_file *s, void *p) +{ + unsigned long ctime_updates = get_mg_ctime_updates(); + unsigned long ctime_swaps = get_mg_ctime_swaps(); + unsigned long fine_stamps = get_mg_fine_stamps(); + unsigned long floor_swaps = timekeeping_get_mg_floor_swaps(); + + seq_printf(s, "%lu %lu %lu %lu\n", + ctime_updates, ctime_swaps, fine_stamps, floor_swaps); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(mgts); + +static int __init mg_debugfs_init(void) +{ + debugfs_create_file("multigrain_timestamps", S_IFREG | S_IRUGO, NULL, NULL, &mgts_fops); + return 0; +} +late_initcall(mg_debugfs_init); + +#else /* ! 
CONFIG_DEBUG_FS */ + +#define mgtime_counter_inc(__var) do { } while (0) + +#endif /* CONFIG_DEBUG_FS */ + /* * Handle nr_inode sysctl */ @@ -174,6 +243,8 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp inode->i_opflags = 0; if (sb->s_xattr) inode->i_opflags |= IOP_XATTR; + if (sb->s_type->fs_flags & FS_MGTIME) + inode->i_opflags |= IOP_MGTIME; i_uid_write(inode, 0); i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); @@ -748,7 +819,7 @@ static void evict(struct inode *inode) * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ - smp_mb(); + smp_mb__after_spinlock(); inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); spin_unlock(&inode->i_lock); @@ -1241,16 +1312,15 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, - * and if present it is return it with an increased reference count. This is - * a variant of iget5_locked() for callers that don't want to fail on memory - * allocation of inode. + * and if present return it with an increased reference count. This is a + * variant of iget5_locked() that doesn't allocate an inode. * - * If the inode is not in cache, insert the pre-allocated inode to cache and + * If the inode is not present in the cache, insert the pre-allocated inode and * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * - * Note both @test and @set are called with the inode_hash_lock held, so can't - * sleep. + * Note that both @test and @set are called with the inode_hash_lock held, so + * they can't sleep. 
*/ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), @@ -1314,16 +1384,16 @@ EXPORT_SYMBOL(inode_insert5); * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, - * and if present it is return it with an increased reference count. This is - * a generalized version of iget_locked() for file systems where the inode + * and if present return it with an increased reference count. This is a + * generalized version of iget_locked() for file systems where the inode * number is not sufficient for unique identification of an inode. * - * If the inode is not in cache, allocate a new inode and return it locked, - * hashed, and with the I_NEW flag set. The file system gets to fill it in - * before unlocking it via unlock_new_inode(). + * If the inode is not present in the cache, allocate and insert a new inode + * and return it locked, hashed, and with the I_NEW flag set. The file system + * gets to fill it in before unlocking it via unlock_new_inode(). * - * Note both @test and @set are called with the inode_hash_lock held, so can't - * sleep. + * Note that both @test and @set are called with the inode_hash_lock held, so + * they can't sleep. */ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), @@ -2211,19 +2281,58 @@ int file_remove_privs(struct file *file) } EXPORT_SYMBOL(file_remove_privs); +/** + * current_time - Return FS time (possibly fine-grained) + * @inode: inode. + * + * Return the current time truncated to the time granularity supported by + * the fs, as suitable for a ctime/mtime change. If the ctime is flagged + * as having been QUERIED, get a fine-grained timestamp, but don't update + * the floor. + * + * For a multigrain inode, this is effectively an estimate of the timestamp + * that a file would receive. 
An actual update must go through + * inode_set_ctime_current(). + */ +struct timespec64 current_time(struct inode *inode) +{ + struct timespec64 now; + u32 cns; + + ktime_get_coarse_real_ts64_mg(&now); + + if (!is_mgtime(inode)) + goto out; + + /* If nothing has queried it, then coarse time is fine */ + cns = smp_load_acquire(&inode->i_ctime_nsec); + if (cns & I_CTIME_QUERIED) { + /* + * If there is no apparent change, then get a fine-grained + * timestamp. + */ + if (now.tv_nsec == (cns & ~I_CTIME_QUERIED)) + ktime_get_real_ts64(&now); + } +out: + return timestamp_truncate(now, inode); +} +EXPORT_SYMBOL(current_time); + static int inode_needs_update_time(struct inode *inode) { + struct timespec64 now, ts; int sync_it = 0; - struct timespec64 now = current_time(inode); - struct timespec64 ts; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; + now = current_time(inode); + ts = inode_get_mtime(inode); if (!timespec64_equal(&ts, &now)) - sync_it = S_MTIME; + sync_it |= S_MTIME; ts = inode_get_ctime(inode); if (!timespec64_equal(&ts, &now)) @@ -2600,6 +2709,16 @@ void inode_nohighmem(struct inode *inode) } EXPORT_SYMBOL(inode_nohighmem); +struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) +{ + trace_inode_set_ctime_to_ts(inode, &ts); + set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec); + inode->i_ctime_sec = ts.tv_sec; + inode->i_ctime_nsec = ts.tv_nsec; + return ts; +} +EXPORT_SYMBOL(inode_set_ctime_to_ts); + /** * timestamp_truncate - Truncate timespec to a granularity * @t: Timespec @@ -2632,39 +2751,159 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) EXPORT_SYMBOL(timestamp_truncate); /** - * current_time - Return FS time - * @inode: inode. + * inode_set_ctime_current - set the ctime to current_time + * @inode: inode * - * Return the current time truncated to the time granularity supported by - * the fs. 
+ * Set the inode's ctime to the current value for the inode. Returns the + * current value that was assigned. If this is not a multigrain inode, then we + * set it to the later of the coarse time and floor value. + * + * If it is multigrain, then we first see if the coarse-grained timestamp is + * distinct from what is already there. If so, then use that. Otherwise, get a + * fine-grained timestamp. * - * Note that inode and inode->sb cannot be NULL. - * Otherwise, the function warns and returns time without truncation. + * After that, try to swap the new value into i_ctime_nsec. Accept the + * resulting ctime, regardless of the outcome of the swap. If it has + * already been replaced, then that timestamp is later than the earlier + * unacceptable one, and is thus acceptable. */ -struct timespec64 current_time(struct inode *inode) +struct timespec64 inode_set_ctime_current(struct inode *inode) { struct timespec64 now; + u32 cns, cur; - ktime_get_coarse_real_ts64(&now); - return timestamp_truncate(now, inode); + ktime_get_coarse_real_ts64_mg(&now); + now = timestamp_truncate(now, inode); + + /* Just return that if this is not a multigrain fs */ + if (!is_mgtime(inode)) { + inode_set_ctime_to_ts(inode, now); + goto out; + } + + /* + * A fine-grained time is only needed if someone has queried + * for timestamps, and the current coarse grained time isn't + * later than what's already there. 
+ */ + cns = smp_load_acquire(&inode->i_ctime_nsec); + if (cns & I_CTIME_QUERIED) { + struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec, + .tv_nsec = cns & ~I_CTIME_QUERIED }; + + if (timespec64_compare(&now, &ctime) <= 0) { + ktime_get_real_ts64_mg(&now); + now = timestamp_truncate(now, inode); + mgtime_counter_inc(mg_fine_stamps); + } + } + mgtime_counter_inc(mg_ctime_updates); + + /* No need to cmpxchg if it's exactly the same */ + if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) { + trace_ctime_xchg_skip(inode, &now); + goto out; + } + cur = cns; +retry: + /* Try to swap the nsec value into place. */ + if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) { + /* If swap occurred, then we're (mostly) done */ + inode->i_ctime_sec = now.tv_sec; + trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur); + mgtime_counter_inc(mg_ctime_swaps); + } else { + /* + * Was the change due to someone marking the old ctime QUERIED? + * If so then retry the swap. This can only happen once since + * the only way to clear I_CTIME_QUERIED is to stamp the inode + * with a new ctime. + */ + if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) { + cns = cur; + goto retry; + } + /* Otherwise, keep the existing ctime */ + now.tv_sec = inode->i_ctime_sec; + now.tv_nsec = cur & ~I_CTIME_QUERIED; + } +out: + return now; } -EXPORT_SYMBOL(current_time); +EXPORT_SYMBOL(inode_set_ctime_current); /** - * inode_set_ctime_current - set the ctime to current_time - * @inode: inode + * inode_set_ctime_deleg - try to update the ctime on a delegated inode + * @inode: inode to update + * @update: timespec64 to set the ctime * - * Set the inode->i_ctime to the current value for the inode. Returns - * the current value that was assigned to i_ctime. + * Attempt to atomically update the ctime on behalf of a delegation holder. + * + * The nfs server can call back the holder of a delegation to get updated + * inode attributes, including the mtime. 
When updating the mtime, update + * the ctime to a value at least equal to that. + * + * This can race with concurrent updates to the inode, in which + * case the update is skipped. + * + * Note that this works even when multigrain timestamps are not enabled, + * so it is used in either case. */ -struct timespec64 inode_set_ctime_current(struct inode *inode) +struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update) { - struct timespec64 now = current_time(inode); + struct timespec64 now, cur_ts; + u32 cur, old; - inode_set_ctime_to_ts(inode, now); - return now; + /* pairs with try_cmpxchg below */ + cur = smp_load_acquire(&inode->i_ctime_nsec); + cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; + cur_ts.tv_sec = inode->i_ctime_sec; + + /* If the update is older than the existing value, skip it. */ + if (timespec64_compare(&update, &cur_ts) <= 0) + return cur_ts; + + ktime_get_coarse_real_ts64_mg(&now); + + /* Clamp the update to "now" if it's in the future */ + if (timespec64_compare(&update, &now) > 0) + update = now; + + update = timestamp_truncate(update, inode); + + /* No need to update if the values are already the same */ + if (timespec64_equal(&update, &cur_ts)) + return cur_ts; + + /* + * Try to swap the nsec value into place. If it fails, that means + * it raced with an update due to a write or similar activity. That + * stamp takes precedence, so just skip the update. + */ +retry: + old = cur; + if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) { + inode->i_ctime_sec = update.tv_sec; + mgtime_counter_inc(mg_ctime_swaps); + return update; + } + + /* + * Was the change due to another task marking the old ctime QUERIED? + * + * If so, then retry the swap. This can only happen once since + * the only way to clear I_CTIME_QUERIED is to stamp the inode + * with a new ctime. + */ + if (!(old & I_CTIME_QUERIED) && (cur == (old | I_CTIME_QUERIED))) + goto retry; + + /* Otherwise, it was a new timestamp. 
*/ + cur_ts.tv_sec = inode->i_ctime_sec; + cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; + return cur_ts; } -EXPORT_SYMBOL(inode_set_ctime_current); +EXPORT_SYMBOL(inode_set_ctime_deleg); /** * in_group_or_capable - check whether caller is CAP_FSETID privileged @@ -2672,7 +2911,7 @@ EXPORT_SYMBOL(inode_set_ctime_current); * @inode: inode to check * @vfsgid: the new/current vfsgid of @inode * - * Check wether @vfsgid is in the caller's group list or if the caller is + * Check whether @vfsgid is in the caller's group list or if the caller is * privileged with CAP_FSETID over @inode. This can be used to determine * whether the setgid bit can be kept or must be dropped. * diff --git a/fs/internal.h b/fs/internal.h index 8c1b7acbbe8f..e7f02ae1e098 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -246,7 +246,6 @@ int open_namespace(struct ns_common *ns); * fs/stat.c: */ -int getname_statx_lookup_flags(int flags); int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer); int do_statx_fd(int fd, unsigned int flags, unsigned int mask, @@ -267,7 +266,7 @@ struct xattr_name { char name[XATTR_NAME_MAX + 1]; }; -struct xattr_ctx { +struct kernel_xattr_ctx { /* Value of attribute */ union { const void __user *cvalue; @@ -280,14 +279,15 @@ struct xattr_ctx { unsigned int flags; }; +ssize_t file_getxattr(struct file *file, struct kernel_xattr_ctx *ctx); +ssize_t filename_getxattr(int dfd, struct filename *filename, + unsigned int lookup_flags, struct kernel_xattr_ctx *ctx); +int file_setxattr(struct file *file, struct kernel_xattr_ctx *ctx); +int filename_setxattr(int dfd, struct filename *filename, + unsigned int lookup_flags, struct kernel_xattr_ctx *ctx); +int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx); +int import_xattr_name(struct xattr_name *kname, const char __user *name); -ssize_t do_getxattr(struct mnt_idmap *idmap, - struct dentry *d, - struct xattr_ctx *ctx); - -int setxattr_copy(const 
char __user *name, struct xattr_ctx *ctx); -int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct xattr_ctx *ctx); int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode); #ifdef CONFIG_FS_POSIX_ACL diff --git a/fs/ioctl.c b/fs/ioctl.c index 6e0c954388d4..638a36be31c1 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -231,11 +231,11 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { - struct fd src_file = fdget(srcfd); + CLASS(fd, src_file)(srcfd); loff_t cloned; int ret; - if (!fd_file(src_file)) + if (fd_empty(src_file)) return -EBADF; cloned = vfs_clone_file_range(fd_file(src_file), off, dst_file, destoff, olen, 0); @@ -245,7 +245,6 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, ret = -EINVAL; else ret = 0; - fdput(src_file); return ret; } @@ -892,22 +891,20 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); int error; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = security_file_ioctl(fd_file(f), cmd, arg); if (error) - goto out; + return error; error = do_vfs_ioctl(fd_file(f), fd, cmd, arg); if (error == -ENOIOCTLCMD) error = vfs_ioctl(fd_file(f), cmd, arg); -out: - fdput(f); return error; } @@ -950,15 +947,15 @@ EXPORT_SYMBOL(compat_ptr_ioctl); COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); int error; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = security_file_ioctl_compat(fd_file(f), cmd, arg); if (error) - goto out; + return error; switch (cmd) { /* FICLONE takes an int argument, so don't use compat_ptr() */ @@ -1009,10 +1006,6 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, error = -ENOTTY; break; } 
- - out: - fdput(f); - return error; } #endif diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 78ebd265f425..ce73d2a48c1e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1145,10 +1145,36 @@ static void iomap_write_delalloc_scan(struct inode *inode, } /* + * When a short write occurs, the filesystem might need to use ->iomap_end + * to remove space reservations created in ->iomap_begin. + * + * For filesystems that use delayed allocation, there can be dirty pages over + * the delalloc extent outside the range of a short write but still within the + * delalloc extent allocated for this iomap if the write raced with page + * faults. + * * Punch out all the delalloc blocks in the range given except for those that * have dirty data still pending in the page cache - those are going to be * written and so must still retain the delalloc backing for writeback. * + * The punch() callback *must* only punch delalloc extents in the range passed + * to it. It must skip over all other types of extents in the range and leave + * them completely unchanged. It must do this punch atomically with respect to + * other extent modifications. + * + * The punch() callback may be called with a folio locked to prevent writeback + * extent allocation racing at the edge of the range we are currently punching. + * The locked folio may or may not cover the range being punched, so it is not + * safe for the punch() callback to lock folios itself. + * + * Lock order is: + * + * inode->i_rwsem (shared or exclusive) + * inode->i_mapping->invalidate_lock (exclusive) + * folio_lock() + * ->punch + * internal filesystem allocation lock + * * As we are scanning the page cache for data, we don't need to reimplement the * wheel - mapping_seek_hole_data() does exactly what we need to identify the * start and end of data ranges correctly even for sub-folio block sizes. 
This @@ -1177,7 +1203,7 @@ static void iomap_write_delalloc_scan(struct inode *inode, * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose * the code to subtle off-by-one bugs.... */ -static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, +void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, loff_t end_byte, unsigned flags, struct iomap *iomap, iomap_punch_t punch) { @@ -1185,12 +1211,13 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, loff_t scan_end_byte = min(i_size_read(inode), end_byte); /* - * Lock the mapping to avoid races with page faults re-instantiating - * folios and dirtying them via ->page_mkwrite whilst we walk the - * cache and perform delalloc extent removal. Failing to do this can - * leave dirty pages with no space reservation in the cache. + * The caller must hold invalidate_lock to avoid races with page faults + * re-instantiating folios and dirtying them via ->page_mkwrite whilst + * we walk the cache and perform delalloc extent removal. Failing to do + * this can leave dirty pages with no space reservation in the cache. 
*/ - filemap_invalidate_lock(inode->i_mapping); + lockdep_assert_held_write(&inode->i_mapping->invalidate_lock); + while (start_byte < scan_end_byte) { loff_t data_end; @@ -1207,7 +1234,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, if (start_byte == -ENXIO || start_byte == scan_end_byte) break; if (WARN_ON_ONCE(start_byte < 0)) - goto out_unlock; + return; WARN_ON_ONCE(start_byte < punch_start_byte); WARN_ON_ONCE(start_byte > scan_end_byte); @@ -1218,7 +1245,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, scan_end_byte, SEEK_HOLE); if (WARN_ON_ONCE(data_end < 0)) - goto out_unlock; + return; /* * If we race with post-direct I/O invalidation of the page cache, @@ -1240,74 +1267,8 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, if (punch_start_byte < end_byte) punch(inode, punch_start_byte, end_byte - punch_start_byte, iomap); -out_unlock: - filemap_invalidate_unlock(inode->i_mapping); } - -/* - * When a short write occurs, the filesystem may need to remove reserved space - * that was allocated in ->iomap_begin from it's ->iomap_end method. For - * filesystems that use delayed allocation, we need to punch out delalloc - * extents from the range that are not dirty in the page cache. As the write can - * race with page faults, there can be dirty pages over the delalloc extent - * outside the range of a short write but still within the delalloc extent - * allocated for this iomap. - * - * This function uses [start_byte, end_byte) intervals (i.e. open ended) to - * simplify range iterations. - * - * The punch() callback *must* only punch delalloc extents in the range passed - * to it. It must skip over all other types of extents in the range and leave - * them completely unchanged. It must do this punch atomically with respect to - * other extent modifications. 
- * - * The punch() callback may be called with a folio locked to prevent writeback - * extent allocation racing at the edge of the range we are currently punching. - * The locked folio may or may not cover the range being punched, so it is not - * safe for the punch() callback to lock folios itself. - * - * Lock order is: - * - * inode->i_rwsem (shared or exclusive) - * inode->i_mapping->invalidate_lock (exclusive) - * folio_lock() - * ->punch - * internal filesystem allocation lock - */ -void iomap_file_buffered_write_punch_delalloc(struct inode *inode, - loff_t pos, loff_t length, ssize_t written, unsigned flags, - struct iomap *iomap, iomap_punch_t punch) -{ - loff_t start_byte; - loff_t end_byte; - unsigned int blocksize = i_blocksize(inode); - - if (iomap->type != IOMAP_DELALLOC) - return; - - /* If we didn't reserve the blocks, we're not allowed to punch them. */ - if (!(iomap->flags & IOMAP_F_NEW)) - return; - - /* - * start_byte refers to the first unused block after a short write. If - * nothing was written, round offset down to point at the first block in - * the range. - */ - if (unlikely(!written)) - start_byte = round_down(pos, blocksize); - else - start_byte = round_up(pos + written, blocksize); - end_byte = round_up(pos + length, blocksize); - - /* Nothing to do if we've written the entire delalloc extent */ - if (start_byte >= end_byte) - return; - - iomap_write_delalloc_release(inode, start_byte, end_byte, flags, iomap, - punch); -} -EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); +EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); static loff_t iomap_unshare_iter(struct iomap_iter *iter) { @@ -1316,22 +1277,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) loff_t length = iomap_length(iter); loff_t written = 0; - /* Don't bother with blocks that are not shared to start with. */ - if (!(iomap->flags & IOMAP_F_SHARED)) - return length; - - /* - * Don't bother with delalloc reservations, holes or unwritten extents. 
- * - * Note that we use srcmap directly instead of iomap_iter_srcmap as - * unsharing requires providing a separate source map, and the presence - * of one is a good indicator that unsharing is needed, unlike - * IOMAP_F_SHARED which can be set for any data that goes into the COW - * fork for XFS. - */ - if (iter->srcmap.type == IOMAP_HOLE || - iter->srcmap.type == IOMAP_DELALLOC || - iter->srcmap.type == IOMAP_UNWRITTEN) + if (!iomap_want_unshare_iter(iter)) return length; do { @@ -1838,7 +1784,7 @@ new_ioend: if (ifs) atomic_add(len, &ifs->write_bytes_pending); wpc->ioend->io_size += len; - wbc_account_cgroup_owner(wbc, &folio->page, len); + wbc_account_cgroup_owner(wbc, folio, len); return 0; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index f637aa0706a3..b521eb15759e 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, * clearing the WRITE_THROUGH flag in the dio request. 
*/ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - const struct iomap *iomap, bool use_fua) + const struct iomap *iomap, bool use_fua, bool atomic) { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; @@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, opflags |= REQ_FUA; else dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + if (atomic) + opflags |= REQ_ATOMIC; return opflags; } @@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; - loff_t length = iomap_length(iter); + const loff_t length = iomap_length(iter); + bool atomic = iter->flags & IOMAP_ATOMIC; loff_t pos = iter->pos; blk_opf_t bio_opf; struct bio *bio; @@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, size_t copied = 0; size_t orig_count; + if (atomic && length != fs_block_size) + return -EINVAL; + if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; @@ -377,12 +383,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; } - /* - * Set the operation flags early so that bio_iov_iter_get_pages - * can set up the page vector appropriately for a ZONE_APPEND - * operation. - */ - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); + bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic); nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { @@ -415,6 +416,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, } n = bio->bi_iter.bi_size; + if (WARN_ON_ONCE(atomic && n != length)) { + /* + * This bio should have covered the complete length, + * which it doesn't, so error. We may need to zero out + * the tail (complete FS block), similar to when + * bio_iov_iter_get_pages() returns an error, above. 
+ */ + ret = -EINVAL; + bio_put(bio); + goto zero_tail; + } if (dio->flags & IOMAP_DIO_WRITE) { task_io_account_write(n); } else { @@ -598,6 +610,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; + if (iocb->ki_flags & IOCB_ATOMIC) + iomi.flags |= IOMAP_ATOMIC; + if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; @@ -659,7 +674,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret != -EAGAIN) { trace_iomap_dio_invalidate_fail(inode, iomi.pos, iomi.len); - ret = -ENOTBLK; + if (iocb->ki_flags & IOCB_ATOMIC) { + /* + * folio invalidation failed, maybe + * this is transient, unlock and see if + * the caller tries again. + */ + ret = -EAGAIN; + } else { + /* fall back to buffered write */ + ret = -ENOTBLK; + } } goto out_free_dio; } diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 0a991c4ce87d..4118a42cdab0 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued); { IOMAP_REPORT, "REPORT" }, \ { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ - { IOMAP_NOWAIT, "NOWAIT" } + { IOMAP_NOWAIT, "NOWAIT" }, \ + { IOMAP_ATOMIC, "ATOMIC" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 4305a1ac808a..9153ff3a08e7 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -662,10 +662,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) JBUFFER_TRACE(jh, "ph3: write metadata"); escape = jbd2_journal_write_metadata_buffer(commit_transaction, jh, &wbuf[bufs], blocknr); - if (escape < 0) { - jbd2_journal_abort(journal, escape); - continue; - } jbd2_file_log_bh(&io_bufs, wbuf[bufs]); /* Record the new block's tag in the current descriptor diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 97f487c3d8fc..7e49d912b091 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -318,7 +318,6 @@ 
static inline void jbd2_data_do_escape(char *data) * * * Return value: - * <0: Error * =0: Finished OK without escape * =1: Finished OK with escape */ @@ -386,12 +385,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, goto escape_done; spin_unlock(&jh_in->b_state_lock); - tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); - if (!tmp) { - brelse(new_bh); - free_buffer_head(new_bh); - return -ENOMEM; - } + tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL); spin_lock(&jh_in->b_state_lock); if (jh_in->b_frozen_data) { jbd2_free(tmp, bh_in->b_size); @@ -1518,9 +1512,10 @@ static int journal_load_superblock(journal_t *journal) * destroy journal_t structures, and to initialise and read existing * journal blocks from disk. */ -/* First: create and setup a journal_t object in memory. We initialise - * very few fields yet: that has to wait until we have created the - * journal structures from from scratch, or loaded them from disk. */ +/* The journal_init_common() function creates and fills a journal_t object + * in memory. It calls journal_load_superblock() to load the on-disk journal + * superblock and initialize the journal_t object. 
+ */ static journal_t *journal_init_common(struct block_device *bdev, struct block_device *fs_dev, diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 667f67342c52..9192be7c19d8 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -485,6 +485,104 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, return tag->t_checksum == cpu_to_be16(csum32); } +static __always_inline int jbd2_do_replay(journal_t *journal, + struct recovery_info *info, + struct buffer_head *bh, + unsigned long *next_log_block, + unsigned int next_commit_ID) +{ + char *tagp; + int flags; + int ret = 0; + int tag_bytes = journal_tag_bytes(journal); + int descr_csum_size = 0; + unsigned long io_block; + journal_block_tag_t tag; + struct buffer_head *obh; + struct buffer_head *nbh; + + if (jbd2_journal_has_csum_v2or3(journal)) + descr_csum_size = sizeof(struct jbd2_journal_block_tail); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while (tagp - bh->b_data + tag_bytes <= + journal->j_blocksize - descr_csum_size) { + int err; + + memcpy(&tag, tagp, sizeof(tag)); + flags = be16_to_cpu(tag.t_flags); + + io_block = (*next_log_block)++; + wrap(journal, *next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + /* Recover what we can, but report failure at the end. */ + ret = err; + pr_err("JBD2: IO error %d recovering block %lu in log\n", + err, io_block); + } else { + unsigned long long blocknr; + + J_ASSERT(obh != NULL); + blocknr = read_tag_block(journal, &tag); + + /* If the block has been revoked, then we're all done here. 
*/ + if (jbd2_journal_test_revoke(journal, blocknr, + next_commit_ID)) { + brelse(obh); + ++info->nr_revoke_hits; + goto skip_write; + } + + /* Look for block corruption */ + if (!jbd2_block_tag_csum_verify(journal, &tag, + (journal_block_tag3_t *)tagp, + obh->b_data, next_commit_ID)) { + brelse(obh); + ret = -EFSBADCRC; + pr_err("JBD2: Invalid checksum recovering data block %llu in journal block %lu\n", + blocknr, io_block); + goto skip_write; + } + + /* Find a buffer for the new data being restored */ + nbh = __getblk(journal->j_fs_dev, blocknr, + journal->j_blocksize); + if (nbh == NULL) { + pr_err("JBD2: Out of memory during recovery.\n"); + brelse(obh); + return -ENOMEM; + } + + lock_buffer(nbh); + memcpy(nbh->b_data, obh->b_data, journal->j_blocksize); + if (flags & JBD2_FLAG_ESCAPE) { + *((__be32 *)nbh->b_data) = + cpu_to_be32(JBD2_MAGIC_NUMBER); + } + + BUFFER_TRACE(nbh, "marking dirty"); + set_buffer_uptodate(nbh); + mark_buffer_dirty(nbh); + BUFFER_TRACE(nbh, "marking uptodate"); + ++info->nr_replays; + unlock_buffer(nbh); + brelse(obh); + brelse(nbh); + } + +skip_write: + tagp += tag_bytes; + if (!(flags & JBD2_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JBD2_FLAG_LAST_TAG) + break; + } + + return ret; +} + static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { @@ -493,13 +591,10 @@ static int do_one_pass(journal_t *journal, int err, success = 0; journal_superblock_t * sb; journal_header_t * tmp; - struct buffer_head * bh; + struct buffer_head *bh = NULL; unsigned int sequence; int blocktype; - int tag_bytes = journal_tag_bytes(journal); __u32 crc32_sum = ~0; /* Transactional Checksums */ - int descr_csum_size = 0; - int block_error = 0; bool need_check_commit_time = false; __u64 last_trans_commit_time = 0, commit_time; @@ -528,12 +623,6 @@ static int do_one_pass(journal_t *journal, */ while (1) { - int flags; - char * tagp; - journal_block_tag_t tag; - struct buffer_head * obh; - struct buffer_head * nbh; - 
cond_resched(); /* If we already know where to stop the log traversal, @@ -552,6 +641,8 @@ static int do_one_pass(journal_t *journal, * record. */ jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block); + brelse(bh); + bh = NULL; err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -567,20 +658,16 @@ static int do_one_pass(journal_t *journal, tmp = (journal_header_t *)bh->b_data; - if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) { - brelse(bh); + if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) break; - } blocktype = be32_to_cpu(tmp->h_blocktype); sequence = be32_to_cpu(tmp->h_sequence); jbd2_debug(3, "Found magic %d, sequence %d\n", blocktype, sequence); - if (sequence != next_commit_ID) { - brelse(bh); + if (sequence != next_commit_ID) break; - } /* OK, we have a valid descriptor block which matches * all of the sequence number checks. What are we going @@ -589,11 +676,7 @@ static int do_one_pass(journal_t *journal, switch(blocktype) { case JBD2_DESCRIPTOR_BLOCK: /* Verify checksum first */ - if (jbd2_journal_has_csum_v2or3(journal)) - descr_csum_size = - sizeof(struct jbd2_journal_block_tail); - if (descr_csum_size > 0 && - !jbd2_descriptor_block_csum_verify(journal, + if (!jbd2_descriptor_block_csum_verify(journal, bh->b_data)) { /* * PASS_SCAN can see stale blocks due to lazy @@ -603,7 +686,6 @@ static int do_one_pass(journal_t *journal, pr_err("JBD2: Invalid checksum recovering block %lu in log\n", next_log_block); err = -EFSBADCRC; - brelse(bh); goto failed; } need_check_commit_time = true; @@ -619,125 +701,39 @@ static int do_one_pass(journal_t *journal, if (pass != PASS_REPLAY) { if (pass == PASS_SCAN && jbd2_has_feature_checksum(journal) && - !need_check_commit_time && !info->end_transaction) { if (calc_chksums(journal, bh, &next_log_block, - &crc32_sum)) { - put_bh(bh); + &crc32_sum)) break; - } - put_bh(bh); continue; } next_log_block += count_tags(journal, bh); wrap(journal, next_log_block); - put_bh(bh); continue; } - /* A 
descriptor block: we can now write all of - * the data blocks. Yay, useful work is finally - * getting done here! */ - - tagp = &bh->b_data[sizeof(journal_header_t)]; - while ((tagp - bh->b_data + tag_bytes) - <= journal->j_blocksize - descr_csum_size) { - unsigned long io_block; - - memcpy(&tag, tagp, sizeof(tag)); - flags = be16_to_cpu(tag.t_flags); - - io_block = next_log_block++; - wrap(journal, next_log_block); - err = jread(&obh, journal, io_block); - if (err) { - /* Recover what we can, but - * report failure at the end. */ - success = err; - printk(KERN_ERR - "JBD2: IO error %d recovering " - "block %lu in log\n", - err, io_block); - } else { - unsigned long long blocknr; - - J_ASSERT(obh != NULL); - blocknr = read_tag_block(journal, - &tag); - - /* If the block has been - * revoked, then we're all done - * here. */ - if (jbd2_journal_test_revoke - (journal, blocknr, - next_commit_ID)) { - brelse(obh); - ++info->nr_revoke_hits; - goto skip_write; - } - - /* Look for block corruption */ - if (!jbd2_block_tag_csum_verify( - journal, &tag, (journal_block_tag3_t *)tagp, - obh->b_data, be32_to_cpu(tmp->h_sequence))) { - brelse(obh); - success = -EFSBADCRC; - printk(KERN_ERR "JBD2: Invalid " - "checksum recovering " - "data block %llu in " - "journal block %lu\n", - blocknr, io_block); - block_error = 1; - goto skip_write; - } - - /* Find a buffer for the new - * data being restored */ - nbh = __getblk(journal->j_fs_dev, - blocknr, - journal->j_blocksize); - if (nbh == NULL) { - printk(KERN_ERR - "JBD2: Out of memory " - "during recovery.\n"); - err = -ENOMEM; - brelse(bh); - brelse(obh); - goto failed; - } - - lock_buffer(nbh); - memcpy(nbh->b_data, obh->b_data, - journal->j_blocksize); - if (flags & JBD2_FLAG_ESCAPE) { - *((__be32 *)nbh->b_data) = - cpu_to_be32(JBD2_MAGIC_NUMBER); - } - - BUFFER_TRACE(nbh, "marking dirty"); - set_buffer_uptodate(nbh); - mark_buffer_dirty(nbh); - BUFFER_TRACE(nbh, "marking uptodate"); - ++info->nr_replays; - unlock_buffer(nbh); 
- brelse(obh); - brelse(nbh); - } - - skip_write: - tagp += tag_bytes; - if (!(flags & JBD2_FLAG_SAME_UUID)) - tagp += 16; - - if (flags & JBD2_FLAG_LAST_TAG) - break; + /* + * A descriptor block: we can now write all of the + * data blocks. Yay, useful work is finally getting + * done here! + */ + err = jbd2_do_replay(journal, info, bh, &next_log_block, + next_commit_ID); + if (err) { + if (err == -ENOMEM) + goto failed; + success = err; } - brelse(bh); continue; case JBD2_COMMIT_BLOCK: + if (pass != PASS_SCAN) { + next_commit_ID++; + continue; + } + /* How to differentiate between interrupted commit * and journal corruption ? * @@ -782,7 +778,6 @@ static int do_one_pass(journal_t *journal, pr_err("JBD2: Invalid checksum found in transaction %u\n", next_commit_ID); err = -EFSBADCRC; - brelse(bh); goto failed; } ignore_crc_mismatch: @@ -792,7 +787,6 @@ static int do_one_pass(journal_t *journal, */ jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n", next_commit_ID); - brelse(bh); goto done; } @@ -802,8 +796,7 @@ static int do_one_pass(journal_t *journal, * much to do other than move on to the next sequence * number. 
*/ - if (pass == PASS_SCAN && - jbd2_has_feature_checksum(journal)) { + if (jbd2_has_feature_checksum(journal)) { struct commit_header *cbh = (struct commit_header *)bh->b_data; unsigned found_chksum = @@ -812,7 +805,6 @@ static int do_one_pass(journal_t *journal, if (info->end_transaction) { journal->j_failed_commit = info->end_transaction; - brelse(bh); break; } @@ -828,36 +820,33 @@ static int do_one_pass(journal_t *journal, goto chksum_error; crc32_sum = ~0; + goto chksum_ok; } - if (pass == PASS_SCAN && - !jbd2_commit_block_csum_verify(journal, - bh->b_data)) { - if (jbd2_commit_block_csum_verify_partial( - journal, - bh->b_data)) { - pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n", - next_commit_ID, next_log_block); - goto chksum_ok; - } - chksum_error: - if (commit_time < last_trans_commit_time) - goto ignore_crc_mismatch; - info->end_transaction = next_commit_ID; - info->head_block = head_block; - if (!jbd2_has_feature_async_commit(journal)) { - journal->j_failed_commit = - next_commit_ID; - brelse(bh); - break; - } + if (jbd2_commit_block_csum_verify(journal, bh->b_data)) + goto chksum_ok; + + if (jbd2_commit_block_csum_verify_partial(journal, + bh->b_data)) { + pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n", + next_commit_ID, next_log_block); + goto chksum_ok; } - if (pass == PASS_SCAN) { - chksum_ok: - last_trans_commit_time = commit_time; - head_block = next_log_block; + +chksum_error: + if (commit_time < last_trans_commit_time) + goto ignore_crc_mismatch; + info->end_transaction = next_commit_ID; + info->head_block = head_block; + + if (!jbd2_has_feature_async_commit(journal)) { + journal->j_failed_commit = next_commit_ID; + break; } - brelse(bh); + +chksum_ok: + last_trans_commit_time = commit_time; + head_block = next_log_block; next_commit_ID++; continue; @@ -876,14 +865,11 @@ static int do_one_pass(journal_t *journal, /* If we aren't in the REVOKE pass, then we can * just skip over this 
block. */ - if (pass != PASS_REVOKE) { - brelse(bh); + if (pass != PASS_REVOKE) continue; - } err = scan_revoke_records(journal, bh, next_commit_ID, info); - brelse(bh); if (err) goto failed; continue; @@ -891,12 +877,12 @@ static int do_one_pass(journal_t *journal, default: jbd2_debug(3, "Unrecognised magic %d, end of scan.\n", blocktype); - brelse(bh); goto done; } } done: + brelse(bh); /* * We broke out of the log scan loop: either we came to the * known end of the log or we found an unexpected block in the @@ -927,11 +913,10 @@ static int do_one_pass(journal_t *journal, success = err; } - if (block_error && success == 0) - success = -EIO; return success; failed: + brelse(bh); return err; } diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 974ecf5e0d95..3ab410059dc2 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -187,7 +187,7 @@ int dbMount(struct inode *ipbmap) } bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); - if (!bmp->db_numag || bmp->db_numag >= MAXAG) { + if (!bmp->db_numag || bmp->db_numag > MAXAG) { err = -EINVAL; goto err_release_metapage; } diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h index 33ef13a0b110..8794281f8ffd 100644 --- a/fs/jfs/jfs_filsys.h +++ b/fs/jfs/jfs_filsys.h @@ -24,6 +24,7 @@ #define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ #define JFS_ERR_CONTINUE 0x00000004 /* continue */ #define JFS_ERR_PANIC 0x00000008 /* panic */ +#define JFS_ERR_MASK (JFS_ERR_REMOUNT_RO|JFS_ERR_CONTINUE|JFS_ERR_PANIC) /* Quota support */ #define JFS_USRQUOTA 0x00000010 diff --git a/fs/jfs/super.c b/fs/jfs/super.c index e1be21ca5d6e..223d9ac59839 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -6,11 +6,11 @@ #include <linux/fs.h> #include <linux/module.h> -#include <linux/parser.h> #include <linux/completion.h> #include <linux/vfs.h> #include <linux/quotaops.h> -#include <linux/mount.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/moduleparam.h> #include <linux/kthread.h> #include 
<linux/posix_acl.h> @@ -210,240 +210,195 @@ enum { Opt_discard, Opt_nodiscard, Opt_discard_minblk }; -static const match_table_t tokens = { - {Opt_integrity, "integrity"}, - {Opt_nointegrity, "nointegrity"}, - {Opt_iocharset, "iocharset=%s"}, - {Opt_resize, "resize=%u"}, - {Opt_resize_nosize, "resize"}, - {Opt_errors, "errors=%s"}, - {Opt_ignore, "noquota"}, - {Opt_quota, "quota"}, - {Opt_usrquota, "usrquota"}, - {Opt_grpquota, "grpquota"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_umask, "umask=%u"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_discard_minblk, "discard=%u"}, - {Opt_err, NULL} +static const struct constant_table jfs_param_errors[] = { + {"continue", JFS_ERR_CONTINUE}, + {"remount-ro", JFS_ERR_REMOUNT_RO}, + {"panic", JFS_ERR_PANIC}, + {} }; -static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, - int *flag) -{ - void *nls_map = (void *)-1; /* -1: no change; NULL: none */ - char *p; - struct jfs_sb_info *sbi = JFS_SBI(sb); +static const struct fs_parameter_spec jfs_param_spec[] = { + fsparam_flag_no ("integrity", Opt_integrity), + fsparam_string ("iocharset", Opt_iocharset), + fsparam_u64 ("resize", Opt_resize), + fsparam_flag ("resize", Opt_resize_nosize), + fsparam_enum ("errors", Opt_errors, jfs_param_errors), + fsparam_flag ("quota", Opt_quota), + fsparam_flag ("noquota", Opt_ignore), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag ("grpquota", Opt_grpquota), + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), + fsparam_u32oct ("umask", Opt_umask), + fsparam_flag ("discard", Opt_discard), + fsparam_u32 ("discard", Opt_discard_minblk), + fsparam_flag ("nodiscard", Opt_nodiscard), + {} +}; - *newLVSize = 0; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_integrity: - *flag &= ~JFS_NOINTEGRITY; - 
break; - case Opt_nointegrity: - *flag |= JFS_NOINTEGRITY; - break; - case Opt_ignore: - /* Silently ignore the quota options */ - /* Don't do anything ;-) */ - break; - case Opt_iocharset: - if (nls_map && nls_map != (void *) -1) - unload_nls(nls_map); - if (!strcmp(args[0].from, "none")) - nls_map = NULL; - else { - nls_map = load_nls(args[0].from); - if (!nls_map) { - pr_err("JFS: charset not found\n"); - goto cleanup; - } - } - break; - case Opt_resize: - { - char *resize = args[0].from; - int rc = kstrtoll(resize, 0, newLVSize); +struct jfs_context { + int flag; + kuid_t uid; + kgid_t gid; + uint umask; + uint minblks_trim; + void *nls_map; + bool resize; + s64 newLVSize; +}; - if (rc) - goto cleanup; - break; - } - case Opt_resize_nosize: - { - *newLVSize = sb_bdev_nr_blocks(sb); - if (*newLVSize == 0) - pr_err("JFS: Cannot determine volume size\n"); - break; +static int jfs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct jfs_context *ctx = fc->fs_private; + int reconfigure = (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE); + struct fs_parse_result result; + struct nls_table *nls_map; + int opt; + + opt = fs_parse(fc, jfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_integrity: + if (result.negated) + ctx->flag |= JFS_NOINTEGRITY; + else + ctx->flag &= ~JFS_NOINTEGRITY; + break; + case Opt_ignore: + /* Silently ignore the quota options */ + /* Don't do anything ;-) */ + break; + case Opt_iocharset: + if (ctx->nls_map && ctx->nls_map != (void *) -1) { + unload_nls(ctx->nls_map); + ctx->nls_map = NULL; } - case Opt_errors: - { - char *errors = args[0].from; - if (!errors || !*errors) - goto cleanup; - if (!strcmp(errors, "continue")) { - *flag &= ~JFS_ERR_REMOUNT_RO; - *flag &= ~JFS_ERR_PANIC; - *flag |= JFS_ERR_CONTINUE; - } else if (!strcmp(errors, "remount-ro")) { - *flag &= ~JFS_ERR_CONTINUE; - *flag &= ~JFS_ERR_PANIC; - *flag |= JFS_ERR_REMOUNT_RO; - } else if (!strcmp(errors, "panic")) { - 
*flag &= ~JFS_ERR_CONTINUE; - *flag &= ~JFS_ERR_REMOUNT_RO; - *flag |= JFS_ERR_PANIC; - } else { - pr_err("JFS: %s is an invalid error handler\n", - errors); - goto cleanup; + if (!strcmp(param->string, "none")) + ctx->nls_map = NULL; + else { + nls_map = load_nls(param->string); + if (!nls_map) { + pr_err("JFS: charset not found\n"); + return -EINVAL; } - break; + ctx->nls_map = nls_map; } + break; + case Opt_resize: + if (!reconfigure) + return -EINVAL; + ctx->resize = true; + ctx->newLVSize = result.uint_64; + break; + case Opt_resize_nosize: + if (!reconfigure) + return -EINVAL; + ctx->resize = true; + break; + case Opt_errors: + ctx->flag &= ~JFS_ERR_MASK; + ctx->flag |= result.uint_32; + break; #ifdef CONFIG_QUOTA - case Opt_quota: - case Opt_usrquota: - *flag |= JFS_USRQUOTA; - break; - case Opt_grpquota: - *flag |= JFS_GRPQUOTA; - break; + case Opt_quota: + case Opt_usrquota: + ctx->flag |= JFS_USRQUOTA; + break; + case Opt_grpquota: + ctx->flag |= JFS_GRPQUOTA; + break; #else - case Opt_usrquota: - case Opt_grpquota: - case Opt_quota: - pr_err("JFS: quota operations not supported\n"); - break; + case Opt_usrquota: + case Opt_grpquota: + case Opt_quota: + pr_err("JFS: quota operations not supported\n"); + break; #endif - case Opt_uid: - { - char *uid = args[0].from; - uid_t val; - int rc = kstrtouint(uid, 0, &val); - - if (rc) - goto cleanup; - sbi->uid = make_kuid(current_user_ns(), val); - if (!uid_valid(sbi->uid)) - goto cleanup; - break; - } - - case Opt_gid: - { - char *gid = args[0].from; - gid_t val; - int rc = kstrtouint(gid, 0, &val); - - if (rc) - goto cleanup; - sbi->gid = make_kgid(current_user_ns(), val); - if (!gid_valid(sbi->gid)) - goto cleanup; - break; + case Opt_uid: + ctx->uid = result.uid; + break; + + case Opt_gid: + ctx->gid = result.gid; + break; + + case Opt_umask: + if (result.uint_32 & ~0777) { + pr_err("JFS: Invalid value of umask\n"); + return -EINVAL; } + ctx->umask = result.uint_32; + break; - case Opt_umask: - { - char *umask 
= args[0].from; - int rc = kstrtouint(umask, 8, &sbi->umask); + case Opt_discard: + /* if set to 1, even copying files will cause + * trimming :O + * -> user has more control over the online trimming + */ + ctx->minblks_trim = 64; + ctx->flag |= JFS_DISCARD; + break; - if (rc) - goto cleanup; - if (sbi->umask & ~0777) { - pr_err("JFS: Invalid value of umask\n"); - goto cleanup; - } - break; - } + case Opt_nodiscard: + ctx->flag &= ~JFS_DISCARD; + break; - case Opt_discard: - /* if set to 1, even copying files will cause - * trimming :O - * -> user has more control over the online trimming - */ - sbi->minblks_trim = 64; - if (bdev_max_discard_sectors(sb->s_bdev)) - *flag |= JFS_DISCARD; - else - pr_err("JFS: discard option not supported on device\n"); - break; - - case Opt_nodiscard: - *flag &= ~JFS_DISCARD; - break; - - case Opt_discard_minblk: - { - char *minblks_trim = args[0].from; - int rc; - if (bdev_max_discard_sectors(sb->s_bdev)) { - *flag |= JFS_DISCARD; - rc = kstrtouint(minblks_trim, 0, - &sbi->minblks_trim); - if (rc) - goto cleanup; - } else - pr_err("JFS: discard option not supported on device\n"); - break; - } + case Opt_discard_minblk: + ctx->minblks_trim = result.uint_32; + ctx->flag |= JFS_DISCARD; + break; - default: - printk("jfs: Unrecognized mount option \"%s\" or missing value\n", - p); - goto cleanup; - } - } - - if (nls_map != (void *) -1) { - /* Discard old (if remount) */ - unload_nls(sbi->nls_tab); - sbi->nls_tab = nls_map; + default: + return -EINVAL; } - return 1; -cleanup: - if (nls_map && nls_map != (void *) -1) - unload_nls(nls_map); return 0; } -static int jfs_remount(struct super_block *sb, int *flags, char *data) +static int jfs_reconfigure(struct fs_context *fc) { - s64 newLVSize = 0; + struct jfs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; + int readonly = fc->sb_flags & SB_RDONLY; int rc = 0; - int flag = JFS_SBI(sb)->flag; + int flag = ctx->flag; int ret; sync_filesystem(sb); - if 
(!parse_options(data, sb, &newLVSize, &flag)) - return -EINVAL; - if (newLVSize) { + /* Transfer results of parsing to the sbi */ + JFS_SBI(sb)->flag = ctx->flag; + JFS_SBI(sb)->uid = ctx->uid; + JFS_SBI(sb)->gid = ctx->gid; + JFS_SBI(sb)->umask = ctx->umask; + JFS_SBI(sb)->minblks_trim = ctx->minblks_trim; + if (ctx->nls_map != (void *) -1) { + unload_nls(JFS_SBI(sb)->nls_tab); + JFS_SBI(sb)->nls_tab = ctx->nls_map; + } + ctx->nls_map = NULL; + + if (ctx->resize) { if (sb_rdonly(sb)) { pr_err("JFS: resize requires volume to be mounted read-write\n"); return -EROFS; } - rc = jfs_extendfs(sb, newLVSize, 0); + + if (!ctx->newLVSize) { + ctx->newLVSize = sb_bdev_nr_blocks(sb); + if (ctx->newLVSize == 0) + pr_err("JFS: Cannot determine volume size\n"); + } + + rc = jfs_extendfs(sb, ctx->newLVSize, 0); if (rc) return rc; } - if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { + if (sb_rdonly(sb) && !readonly) { /* * Invalidate any previously read metadata. fsck may have * changed the on-disk data since we mounted r/o @@ -459,7 +414,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) dquot_resume(sb, -1); return ret; } - if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { + if (!sb_rdonly(sb) && readonly) { rc = dquot_suspend(sb, -1); if (rc < 0) return rc; @@ -467,7 +422,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) JFS_SBI(sb)->flag = flag; return rc; } - if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) + if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) { if (!sb_rdonly(sb)) { rc = jfs_umount_rw(sb); if (rc) @@ -477,18 +432,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) ret = jfs_mount_rw(sb, 1); return ret; } + } JFS_SBI(sb)->flag = flag; return 0; } -static int jfs_fill_super(struct super_block *sb, void *data, int silent) +static int jfs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct jfs_context *ctx = fc->fs_private; + int silent 
= fc->sb_flags & SB_SILENT; struct jfs_sb_info *sbi; struct inode *inode; int rc; - s64 newLVSize = 0; - int flag, ret = -EINVAL; + int ret = -EINVAL; jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags); @@ -501,24 +458,34 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_min = 0; sb->s_time_max = U32_MAX; sbi->sb = sb; - sbi->uid = INVALID_UID; - sbi->gid = INVALID_GID; - sbi->umask = -1; - - /* initialize the mount flag and determine the default error handler */ - flag = JFS_ERR_REMOUNT_RO; - if (!parse_options((char *) data, sb, &newLVSize, &flag)) - goto out_kfree; - sbi->flag = flag; + /* Transfer results of parsing to the sbi */ + sbi->flag = ctx->flag; + sbi->uid = ctx->uid; + sbi->gid = ctx->gid; + sbi->umask = ctx->umask; + if (ctx->nls_map != (void *) -1) { + unload_nls(sbi->nls_tab); + sbi->nls_tab = ctx->nls_map; + } + ctx->nls_map = NULL; + + if (sbi->flag & JFS_DISCARD) { + if (!bdev_max_discard_sectors(sb->s_bdev)) { + pr_err("JFS: discard option not supported on device\n"); + sbi->flag &= ~JFS_DISCARD; + } else { + sbi->minblks_trim = ctx->minblks_trim; + } + } #ifdef CONFIG_JFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif - if (newLVSize) { + if (ctx->resize) { pr_err("resize option for remount only\n"); - goto out_kfree; + goto out_unload; } /* @@ -608,7 +575,6 @@ out_mount_failed: sbi->direct_inode = NULL; out_unload: unload_nls(sbi->nls_tab); -out_kfree: kfree(sbi); return ret; } @@ -664,10 +630,9 @@ out: return rc; } -static struct dentry *jfs_do_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int jfs_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super); + return get_tree_bdev(fc, jfs_fill_super); } static int jfs_sync_fs(struct super_block *sb, int wait) @@ -886,7 +851,6 @@ static const struct super_operations jfs_super_operations = { .freeze_fs = jfs_freeze, .unfreeze_fs = jfs_unfreeze, .statfs = 
jfs_statfs, - .remount_fs = jfs_remount, .show_options = jfs_show_options, #ifdef CONFIG_QUOTA .quota_read = jfs_quota_read, @@ -902,12 +866,71 @@ static const struct export_operations jfs_export_operations = { .get_parent = jfs_get_parent, }; +static void jfs_init_options(struct fs_context *fc, struct jfs_context *ctx) +{ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct super_block *sb = fc->root->d_sb; + + /* Copy over current option values and mount flags */ + ctx->uid = JFS_SBI(sb)->uid; + ctx->gid = JFS_SBI(sb)->gid; + ctx->umask = JFS_SBI(sb)->umask; + ctx->nls_map = (void *)-1; + ctx->minblks_trim = JFS_SBI(sb)->minblks_trim; + ctx->flag = JFS_SBI(sb)->flag; + + } else { + /* + * Initialize the mount flag and determine the default + * error handler + */ + ctx->flag = JFS_ERR_REMOUNT_RO; + ctx->uid = INVALID_UID; + ctx->gid = INVALID_GID; + ctx->umask = -1; + ctx->nls_map = (void *)-1; + } +} + +static void jfs_free_fc(struct fs_context *fc) +{ + struct jfs_context *ctx = fc->fs_private; + + if (ctx->nls_map != (void *) -1) + unload_nls(ctx->nls_map); + kfree(ctx); +} + +static const struct fs_context_operations jfs_context_ops = { + .parse_param = jfs_parse_param, + .get_tree = jfs_get_tree, + .reconfigure = jfs_reconfigure, + .free = jfs_free_fc, +}; + +static int jfs_init_fs_context(struct fs_context *fc) +{ + struct jfs_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + jfs_init_options(fc, ctx); + + fc->fs_private = ctx; + fc->ops = &jfs_context_ops; + + return 0; +} + static struct file_system_type jfs_fs_type = { .owner = THIS_MODULE, .name = "jfs", - .mount = jfs_do_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = jfs_init_fs_context, + .parameters = jfs_param_spec, }; MODULE_ALIAS_FS("jfs"); diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 9ff37ae650ea..de32c95d823d 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -175,15 +175,11 
@@ ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf, size_t buf_size, size_t *file_size, enum kernel_read_file_id id) { - struct fd f = fdget(fd); - ssize_t ret = -EBADF; + CLASS(fd, f)(fd); - if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ)) - goto out; + if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ)) + return -EBADF; - ret = kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id); -out: - fdput(f); - return ret; + return kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id); } EXPORT_SYMBOL_GPL(kernel_read_file_from_fd); diff --git a/fs/libfs.c b/fs/libfs.c index 46966fd8bcf9..748ac5923154 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -77,6 +77,10 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned return ERR_PTR(-ENAMETOOLONG); if (!dentry->d_sb->s_d_op) d_set_d_op(dentry, &simple_dentry_operations); + + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + return NULL; + d_add(dentry, NULL); return NULL; } @@ -1711,15 +1715,6 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENOENT); } -static int empty_dir_getattr(struct mnt_idmap *idmap, - const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) -{ - struct inode *inode = d_inode(path->dentry); - generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); - return 0; -} - static int empty_dir_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { @@ -1733,9 +1728,7 @@ static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t siz static const struct inode_operations empty_dir_inode_operations = { .lookup = empty_dir_lookup, - .permission = generic_permission, .setattr = empty_dir_setattr, - .getattr = empty_dir_getattr, .listxattr = empty_dir_listxattr, }; @@ -1791,8 +1784,8 @@ bool is_empty_dir_inode(struct inode *inode) * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ -static int 
generic_ci_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) +int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) { const struct dentry *parent; const struct inode *dir; @@ -1835,6 +1828,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } +EXPORT_SYMBOL(generic_ci_d_compare); /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems @@ -1843,7 +1837,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ -static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) +int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; @@ -1858,6 +1852,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) return -EINVAL; return 0; } +EXPORT_SYMBOL(generic_ci_d_hash); static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 1f2149db10f2..2359347c9fbd 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -30,7 +30,6 @@ #include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/nlm.h> #include <linux/lockd/lockd.h> -#include <linux/exportfs.h> #define NLMDBG_FACILITY NLMDBG_SVCLOCK @@ -481,7 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, int wait, struct nlm_cookie *cookie, int reclaim) { - struct inode *inode = nlmsvc_file_inode(file); + struct inode *inode __maybe_unused = nlmsvc_file_inode(file); struct nlm_block *block = NULL; int error; int mode; @@ -496,7 +495,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long 
long)lock->fl.fl_end, wait); - if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) { + if (!locks_can_async_lock(nlmsvc_file_file(file)->f_op)) { async_block = wait; wait = 0; } @@ -550,7 +549,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, * requests on the underlaying ->lock() implementation but * only one nlm_block to being granted by lm_grant(). */ - if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) && + if (locks_can_async_lock(nlmsvc_file_file(file)->f_op) && !list_empty(&block->b_list)) { spin_unlock(&nlm_blocked_lock); ret = nlm_lck_blocked; diff --git a/fs/locks.c b/fs/locks.c index 204847628f3e..25afc8d9c9d1 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2136,7 +2136,6 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { int can_sleep, error, type; struct file_lock fl; - struct fd f; /* * LOCK_MAND locks were broken for a long time in that they never @@ -2155,19 +2154,18 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) if (type < 0) return type; - error = -EBADF; - f = fdget(fd); - if (!fd_file(f)) - return error; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; if (type != F_UNLCK && !(fd_file(f)->f_mode & (FMODE_READ | FMODE_WRITE))) - goto out_putf; + return -EBADF; flock_make_lock(fd_file(f), &fl, type); error = security_file_lock(fd_file(f), fl.c.flc_type); if (error) - goto out_putf; + return error; can_sleep = !(cmd & LOCK_NB); if (can_sleep) @@ -2181,9 +2179,6 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) error = locks_lock_file_wait(fd_file(f), &fl); locks_release_private(&fl); - out_putf: - fdput(f); - return error; } diff --git a/fs/mpage.c b/fs/mpage.c index b5b5ddf9d513..82aecf372743 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -606,7 +606,7 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. 
it will not write anything) */ - wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); + wbc_account_cgroup_owner(wbc, folio, folio_size(folio)); length = first_unmapped << blkbits; if (!bio_add_folio(bio, folio, length, 0)) { bio = mpage_bio_submit_write(bio); diff --git a/fs/namei.c b/fs/namei.c index 4a4a22a08ac2..9d30c7aa9aa6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -211,22 +211,38 @@ getname_flags(const char __user *filename, int flags) return result; } -struct filename * -getname_uflags(const char __user *filename, int uflags) +struct filename *getname_uflags(const char __user *filename, int uflags) { int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; return getname_flags(filename, flags); } -struct filename * -getname(const char __user * filename) +struct filename *getname(const char __user * filename) { return getname_flags(filename, 0); } -struct filename * -getname_kernel(const char * filename) +struct filename *__getname_maybe_null(const char __user *pathname) +{ + struct filename *name; + char c; + + /* try to save on allocations; loss on um, though */ + if (get_user(c, pathname)) + return ERR_PTR(-EFAULT); + if (!c) + return NULL; + + name = getname_flags(pathname, LOOKUP_EMPTY); + if (!IS_ERR(name) && !(name->name[0])) { + putname(name); + name = NULL; + } + return name; +} + +struct filename *getname_kernel(const char * filename) { struct filename *result; int len = strlen(filename) + 1; @@ -264,7 +280,7 @@ EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { - if (IS_ERR(name)) + if (IS_ERR_OR_NULL(name)) return; if (WARN_ON_ONCE(!atomic_read(&name->refcnt))) @@ -326,6 +342,25 @@ static int check_acl(struct mnt_idmap *idmap, return -EAGAIN; } +/* + * Very quick optimistic "we know we have no ACL's" check. + * + * Note that this is purely for ACL_TYPE_ACCESS, and purely + * for the "we have cached that there are no ACLs" case. + * + * If this returns true, we know there are no ACLs. 
But if + * it returns false, we might still not have ACLs (it could + * be the is_uncached_acl() case). + */ +static inline bool no_acl_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_POSIX_ACL + return likely(!READ_ONCE(inode->i_acl)); +#else + return true; +#endif +} + /** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from @@ -348,6 +383,28 @@ static int acl_permission_check(struct mnt_idmap *idmap, unsigned int mode = inode->i_mode; vfsuid_t vfsuid; + /* + * Common cheap case: everybody has the requested + * rights, and there are no ACLs to check. No need + * to do any owner/group checks in that case. + * + * - 'mask&7' is the requested permission bit set + * - multiplying by 0111 spreads them out to all of ugo + * - '& ~mode' looks for missing inode permission bits + * - the '!' is for "no missing permissions" + * + * After that, we just need to check that there are no + * ACL's on the inode - do the 'IS_POSIXACL()' check last + * because it will dereference the ->i_sb pointer and we + * want to avoid that if at all possible. + */ + if (!((mask & 7) * 0111 & ~mode)) { + if (no_acl_inode(inode)) + return 0; + if (!IS_POSIXACL(inode)) + return 0; + } + /* Are we the owner? If so, ACL's don't matter */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { @@ -588,6 +645,7 @@ struct nameidata { unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; + const char *pathname; struct nameidata *saved; unsigned root_seq; int dfd; @@ -606,6 +664,7 @@ static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) p->depth = 0; p->dfd = dfd; p->name = name; + p->pathname = likely(name) ? name->name : ""; p->path.mnt = NULL; p->path.dentry = NULL; p->total_link_count = old ? 
old->total_link_count : 0; @@ -2439,7 +2498,7 @@ OK: static const char *path_init(struct nameidata *nd, unsigned flags) { int error; - const char *s = nd->name->name; + const char *s = nd->pathname; /* LOOKUP_CACHED requires RCU, ask caller to retry */ if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) @@ -2503,26 +2562,22 @@ static const char *path_init(struct nameidata *nd, unsigned flags) } } else { /* Caller must check execute permissions on the starting path component */ - struct fd f = fdget_raw(nd->dfd); + CLASS(fd_raw, f)(nd->dfd); struct dentry *dentry; - if (!fd_file(f)) + if (fd_empty(f)) return ERR_PTR(-EBADF); if (flags & LOOKUP_LINKAT_EMPTY) { if (fd_file(f)->f_cred != current_cred() && - !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) { - fdput(f); + !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) return ERR_PTR(-ENOENT); - } } dentry = fd_file(f)->f_path.dentry; - if (*s && unlikely(!d_can_lookup(dentry))) { - fdput(f); + if (*s && unlikely(!d_can_lookup(dentry))) return ERR_PTR(-ENOTDIR); - } nd->path = fd_file(f)->f_path; if (flags & LOOKUP_RCU) { @@ -2532,7 +2587,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags) path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } - fdput(f); } /* For scoped-lookups we need to set the root to the dirfd as well. 
*/ diff --git a/fs/namespace.c b/fs/namespace.c index 93c377816d75..6b0a17487d0f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3901,7 +3901,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } new_ns->ns.ops = &mntns_operations; if (!anon) - new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); + new_ns->seq = atomic64_inc_return(&mnt_ns_seq); refcount_set(&new_ns->ns.count, 1); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; @@ -3944,7 +3944,9 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); - free_mnt_ns(new_ns); + ns_free_inum(&new_ns->ns); + dec_mnt_namespaces(new_ns->ucounts); + mnt_ns_release(new_ns); return ERR_CAST(new); } if (user_ns != ns->user_ns) { @@ -4105,7 +4107,6 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, struct file *file; struct path newmount; struct mount *mnt; - struct fd f; unsigned int mnt_flags = 0; long ret; @@ -4133,19 +4134,18 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, return -EINVAL; } - f = fdget(fs_fd); - if (!fd_file(f)) + CLASS(fd, f)(fs_fd); + if (fd_empty(f)) return -EBADF; - ret = -EINVAL; if (fd_file(f)->f_op != &fscontext_fops) - goto err_fsfd; + return -EINVAL; fc = fd_file(f)->private_data; ret = mutex_lock_interruptible(&fc->uapi_mutex); if (ret < 0) - goto err_fsfd; + return ret; /* There must be a valid superblock or we can't mount it */ ret = -EINVAL; @@ -4212,8 +4212,6 @@ err_path: path_put(&newmount); err_unlock: mutex_unlock(&fc->uapi_mutex); -err_fsfd: - fdput(f); return ret; } @@ -4668,10 +4666,8 @@ out: static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr, unsigned int flags) { - int err = 0; struct ns_common *ns; struct user_namespace *mnt_userns; - struct fd f; if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) return 0; @@ -4687,20 +4683,16 @@ 
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, if (attr->userns_fd > INT_MAX) return -EINVAL; - f = fdget(attr->userns_fd); - if (!fd_file(f)) + CLASS(fd, f)(attr->userns_fd); + if (fd_empty(f)) return -EBADF; - if (!proc_ns_file(fd_file(f))) { - err = -EINVAL; - goto out_fput; - } + if (!proc_ns_file(fd_file(f))) + return -EINVAL; ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWUSER) { - err = -EINVAL; - goto out_fput; - } + if (ns->ops->type != CLONE_NEWUSER) + return -EINVAL; /* * The initial idmapping cannot be used to create an idmapped @@ -4711,22 +4703,15 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, * result. */ mnt_userns = container_of(ns, struct user_namespace, ns); - if (mnt_userns == &init_user_ns) { - err = -EPERM; - goto out_fput; - } + if (mnt_userns == &init_user_ns) + return -EPERM; /* We're not controlling the target namespace. */ - if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) { - err = -EPERM; - goto out_fput; - } + if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) + return -EPERM; kattr->mnt_userns = get_user_ns(mnt_userns); - -out_fput: - fdput(f); - return err; + return 0; } static int build_mount_kattr(const struct mount_attr *attr, size_t usize, @@ -5004,6 +4989,40 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) return 0; } +static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq) +{ + struct super_block *sb = s->mnt->mnt_sb; + + if (sb->s_subtype) + seq_puts(seq, sb->s_subtype); +} + +static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) +{ + struct super_block *sb = s->mnt->mnt_sb; + struct mount *r = real_mount(s->mnt); + + if (sb->s_op->show_devname) { + size_t start = seq->count; + int ret; + + ret = sb->s_op->show_devname(seq, s->mnt->mnt_root); + if (ret) + return ret; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + /* Unescape the result */ + seq->buf[seq->count] = 
'\0'; + seq->count = start; + seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); + } else if (r->mnt_devname) { + seq_puts(seq, r->mnt_devname); + } + return 0; +} + static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; @@ -5038,35 +5057,134 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) return 0; } +static inline int statmount_opt_unescape(struct seq_file *seq, char *buf_start) +{ + char *buf_end, *opt_start, *opt_end; + int count = 0; + + buf_end = seq->buf + seq->count; + *buf_end = '\0'; + for (opt_start = buf_start + 1; opt_start < buf_end; opt_start = opt_end + 1) { + opt_end = strchrnul(opt_start, ','); + *opt_end = '\0'; + buf_start += string_unescape(opt_start, buf_start, 0, UNESCAPE_OCTAL) + 1; + if (WARN_ON_ONCE(++count == INT_MAX)) + return -EOVERFLOW; + } + seq->count = buf_start - 1 - seq->buf; + return count; +} + +static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq) +{ + struct vfsmount *mnt = s->mnt; + struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; + char *buf_start; + int err; + + if (!sb->s_op->show_options) + return 0; + + buf_start = seq->buf + start; + err = sb->s_op->show_options(seq, mnt->mnt_root); + if (err) + return err; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + if (seq->count == start) + return 0; + + err = statmount_opt_unescape(seq, buf_start); + if (err < 0) + return err; + + s->sm.opt_num = err; + return 0; +} + +static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) +{ + struct vfsmount *mnt = s->mnt; + struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; + char *buf_start; + int err; + + buf_start = seq->buf + start; + + err = security_sb_show_options(seq, sb); + if (!err) + return err; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + if (seq->count == start) + return 0; + + err = 
statmount_opt_unescape(seq, buf_start); + if (err < 0) + return err; + + s->sm.opt_sec_num = err; + return 0; +} + static int statmount_string(struct kstatmount *s, u64 flag) { - int ret; + int ret = 0; size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; + u32 start = seq->count; switch (flag) { case STATMOUNT_FS_TYPE: - sm->fs_type = seq->count; + sm->fs_type = start; ret = statmount_fs_type(s, seq); break; case STATMOUNT_MNT_ROOT: - sm->mnt_root = seq->count; + sm->mnt_root = start; ret = statmount_mnt_root(s, seq); break; case STATMOUNT_MNT_POINT: - sm->mnt_point = seq->count; + sm->mnt_point = start; ret = statmount_mnt_point(s, seq); break; case STATMOUNT_MNT_OPTS: - sm->mnt_opts = seq->count; + sm->mnt_opts = start; ret = statmount_mnt_opts(s, seq); break; + case STATMOUNT_OPT_ARRAY: + sm->opt_array = start; + ret = statmount_opt_array(s, seq); + break; + case STATMOUNT_OPT_SEC_ARRAY: + sm->opt_sec_array = start; + ret = statmount_opt_sec_array(s, seq); + break; + case STATMOUNT_FS_SUBTYPE: + sm->fs_subtype = start; + statmount_fs_subtype(s, seq); + break; + case STATMOUNT_SB_SOURCE: + sm->sb_source = start; + ret = statmount_sb_source(s, seq); + break; default: WARN_ON_ONCE(true); return -EINVAL; } + /* + * If nothing was emitted, return to avoid setting the flag + * and terminating the buffer. 
+ */ + if (seq->count == start) + return ret; if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize))) return -EOVERFLOW; if (kbufsize >= s->bufsize) @@ -5201,6 +5319,18 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_MNT_OPTS) err = statmount_string(s, STATMOUNT_MNT_OPTS); + if (!err && s->mask & STATMOUNT_OPT_ARRAY) + err = statmount_string(s, STATMOUNT_OPT_ARRAY); + + if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY) + err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY); + + if (!err && s->mask & STATMOUNT_FS_SUBTYPE) + err = statmount_string(s, STATMOUNT_FS_SUBTYPE); + + if (!err && s->mask & STATMOUNT_SB_SOURCE) + err = statmount_string(s, STATMOUNT_SB_SOURCE); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); @@ -5222,7 +5352,9 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) } #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ - STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS) + STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ + STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ + STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index c40e226053cc..7ac34550c403 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -67,7 +67,8 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in * Decant the list of folios to read into a rolling buffer. 
*/ static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq, - struct folio_queue *folioq) + struct folio_queue *folioq, + struct folio_batch *put_batch) { unsigned int order, nr; size_t size = 0; @@ -82,6 +83,9 @@ static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq, order = folio_order(folio); folioq->orders[i] = order; size += PAGE_SIZE << order; + + if (!folio_batch_add(put_batch, folio)) + folio_batch_release(put_batch); } for (int i = nr; i < folioq_nr_slots(folioq); i++) @@ -120,6 +124,9 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) * that we will need to release later - but we don't want to do * that until after we've started the I/O. */ + struct folio_batch put_batch; + + folio_batch_init(&put_batch); while (rreq->submitted < subreq->start + rsize) { struct folio_queue *tail = rreq->buffer_tail, *new; size_t added; @@ -132,10 +139,11 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) new->prev = tail; tail->next = new; rreq->buffer_tail = new; - added = netfs_load_buffer_from_ra(rreq, new); + added = netfs_load_buffer_from_ra(rreq, new, &put_batch); rreq->iter.count += added; rreq->submitted += added; } + folio_batch_release(&put_batch); } subreq->len = rsize; @@ -348,6 +356,7 @@ static int netfs_wait_for_read(struct netfs_io_request *rreq) static int netfs_prime_buffer(struct netfs_io_request *rreq) { struct folio_queue *folioq; + struct folio_batch put_batch; size_t added; folioq = kmalloc(sizeof(*folioq), GFP_KERNEL); @@ -360,39 +369,14 @@ static int netfs_prime_buffer(struct netfs_io_request *rreq) rreq->submitted = rreq->start; iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, 0); - added = netfs_load_buffer_from_ra(rreq, folioq); + folio_batch_init(&put_batch); + added = netfs_load_buffer_from_ra(rreq, folioq, &put_batch); + folio_batch_release(&put_batch); rreq->iter.count += added; rreq->submitted += added; return 0; } -/* - * Drop the ref on 
each folio that we inherited from the VM readahead code. We - * still have the folio locks to pin the page until we complete the I/O. - * - * Note that we can't just release the batch in each queue struct as we use the - * occupancy count in other places. - */ -static void netfs_put_ra_refs(struct folio_queue *folioq) -{ - struct folio_batch fbatch; - - folio_batch_init(&fbatch); - while (folioq) { - for (unsigned int slot = 0; slot < folioq_count(folioq); slot++) { - struct folio *folio = folioq_folio(folioq, slot); - if (!folio) - continue; - trace_netfs_folio(folio, netfs_folio_trace_read_put); - if (!folio_batch_add(&fbatch, folio)) - folio_batch_release(&fbatch); - } - folioq = folioq->next; - } - - folio_batch_release(&fbatch); -} - /** * netfs_readahead - Helper to manage a read request * @ractl: The description of the readahead request @@ -436,9 +420,6 @@ void netfs_readahead(struct readahead_control *ractl) goto cleanup_free; netfs_read_to_pagecache(rreq); - /* Release the folio refs whilst we're waiting for the I/O. 
*/ - netfs_put_ra_refs(rreq->buffer); - netfs_put_request(rreq, true, netfs_rreq_trace_put_return); return; @@ -646,7 +627,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, if (unlikely(always_fill)) { if (pos - offset + len <= i_size) return false; /* Page entirely before EOF */ - zero_user_segment(&folio->page, 0, plen); + folio_zero_segment(folio, 0, plen); folio_mark_uptodate(folio); return true; } @@ -665,7 +646,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, return false; zero_out: - zero_user_segments(&folio->page, 0, offset, offset + len, plen); + folio_zero_segments(folio, 0, offset, offset + len, plen); return true; } @@ -732,7 +713,7 @@ retry: if (folio_test_uptodate(folio)) goto have_folio; - /* If the page is beyond the EOF, we want to clear it - unless it's + /* If the folio is beyond the EOF, we want to clear it - unless it's * within the cache granule containing the EOF, in which case we need * to preload the granule. */ @@ -792,7 +773,7 @@ error: EXPORT_SYMBOL(netfs_write_begin); /* - * Preload the data into a page we're proposing to write into. + * Preload the data into a folio we're proposing to write into. */ int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index b3910dfcb56d..b4826360a411 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -83,13 +83,13 @@ static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode, * netfs_perform_write - Copy data into the pagecache. * @iocb: The operation parameters * @iter: The source buffer - * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * @netfs_group: Grouping for dirty folios (eg. ceph snaps). * - * Copy data into pagecache pages attached to the inode specified by @iocb. + * Copy data into pagecache folios attached to the inode specified by @iocb. 
* The caller must hold appropriate inode locks. * - * Dirty pages are tagged with a netfs_folio struct if they're not up to date - * to indicate the range modified. Dirty pages may also be tagged with a + * Dirty folios are tagged with a netfs_folio struct if they're not up to date + * to indicate the range modified. Dirty folios may also be tagged with a * netfs-specific grouping such that data from an old group gets flushed before * a new one is started. */ @@ -223,11 +223,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, * we try to read it. */ if (fpos >= ctx->zero_point) { - zero_user_segment(&folio->page, 0, offset); + folio_zero_segment(folio, 0, offset); copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (unlikely(copied == 0)) goto copy_failed; - zero_user_segment(&folio->page, offset + copied, flen); + folio_zero_segment(folio, offset + copied, flen); __netfs_set_group(folio, netfs_group); folio_mark_uptodate(folio); trace_netfs_folio(folio, netfs_modify_and_clear); @@ -407,7 +407,7 @@ EXPORT_SYMBOL(netfs_perform_write); * netfs_buffered_write_iter_locked - write data to a file * @iocb: IO state structure (file, offset, etc.) * @from: iov_iter with data to write - * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * @netfs_group: Grouping for dirty folios (eg. ceph snaps). * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates @@ -491,7 +491,9 @@ EXPORT_SYMBOL(netfs_file_write_iter); /* * Notification that a previously read-only page is about to become writable. - * Note that the caller indicates a single page of a multipage folio. + * The caller indicates the precise page that needs to be written to, but + * we only track group on a per-folio basis, so we block more often than + * we might otherwise. 
*/ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group) { @@ -501,7 +503,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr struct address_space *mapping = file->f_mapping; struct inode *inode = file_inode(file); struct netfs_inode *ictx = netfs_inode(inode); - vm_fault_t ret = VM_FAULT_RETRY; + vm_fault_t ret = VM_FAULT_NOPAGE; int err; _enter("%lx", folio->index); @@ -510,21 +512,15 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr if (folio_lock_killable(folio) < 0) goto out; - if (folio->mapping != mapping) { - folio_unlock(folio); - ret = VM_FAULT_NOPAGE; - goto out; - } - - if (folio_wait_writeback_killable(folio)) { - ret = VM_FAULT_LOCKED; - goto out; - } + if (folio->mapping != mapping) + goto unlock; + if (folio_wait_writeback_killable(folio) < 0) + goto unlock; /* Can we see a streaming write here? */ if (WARN_ON(!folio_test_uptodate(folio))) { - ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED; - goto out; + ret = VM_FAULT_SIGBUS; + goto unlock; } group = netfs_folio_group(folio); @@ -559,5 +555,8 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr out: sb_end_pagefault(inode->i_sb); return ret; +unlock: + folio_unlock(folio); + goto out; } EXPORT_SYMBOL(netfs_page_mkwrite); diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c index cb75c07b5281..ced14ac78cc1 100644 --- a/fs/netfs/fscache_volume.c +++ b/fs/netfs/fscache_volume.c @@ -322,8 +322,7 @@ maybe_wait: } return; no_wait: - clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags); - wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING); + clear_and_wake_up_bit(FSCACHE_VOLUME_CREATING, &volume->flags); } /* diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c index 21eab56ee2f9..2249ecd09d0a 100644 --- a/fs/netfs/locking.c +++ b/fs/netfs/locking.c @@ -109,6 +109,7 @@ int netfs_start_io_write(struct inode *inode) up_write(&inode->i_rwsem); return -ERESTARTSYS; } 
+ downgrade_write(&inode->i_rwsem); return 0; } EXPORT_SYMBOL(netfs_start_io_write); @@ -123,7 +124,7 @@ EXPORT_SYMBOL(netfs_start_io_write); void netfs_end_io_write(struct inode *inode) __releases(inode->i_rwsem) { - up_write(&inode->i_rwsem); + up_read(&inode->i_rwsem); } EXPORT_SYMBOL(netfs_end_io_write); diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index b18c65ba5580..3cbb289535a8 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -77,6 +77,8 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, folio_unlock(folio); } } + + folioq_clear(folioq, slot); } /* diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 114282398716..03ecc7765615 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -181,8 +181,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) #if IS_ENABLED(CONFIG_NFS_LOCALIO) seqlock_init(&clp->cl_boot_lock); ktime_get_real_ts64(&clp->cl_nfssvc_boot); - clp->cl_uuid.net = NULL; - clp->cl_uuid.dom = NULL; + nfs_uuid_init(&clp->cl_uuid); spin_lock_init(&clp->cl_localio_lock); #endif /* CONFIG_NFS_LOCALIO */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 542c7d97b235..596f35170137 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -205,12 +205,15 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) nfs_fscache_invalidate(inode, 0); flags &= ~NFS_INO_REVAL_FORCED; - nfsi->cache_validity |= flags; + flags |= nfsi->cache_validity; + if (inode->i_mapping->nrpages == 0) + flags &= ~NFS_INO_INVALID_DATA; - if (inode->i_mapping->nrpages == 0) { - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; - nfs_ooo_clear(nfsi); - } else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { + /* pairs with nfs_clear_invalid_mapping()'s smp_load_acquire() */ + smp_store_release(&nfsi->cache_validity, flags); + + if (inode->i_mapping->nrpages == 0 || + nfsi->cache_validity & NFS_INO_INVALID_DATA) { nfs_ooo_clear(nfsi); } trace_nfs_set_cache_invalid(inode, 0); @@ 
-628,23 +631,35 @@ nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr) } } +static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid) +{ + enum file_time_flags time_flags = 0; + unsigned int cache_flags = 0; + + if (ia_valid & ATTR_MTIME) { + time_flags |= S_MTIME | S_CTIME; + cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + } + if (ia_valid & ATTR_ATIME) { + time_flags |= S_ATIME; + cache_flags |= NFS_INO_INVALID_ATIME; + } + inode_update_timestamps(inode, time_flags); + NFS_I(inode)->cache_validity &= ~cache_flags; +} + void nfs_update_delegated_atime(struct inode *inode) { spin_lock(&inode->i_lock); - if (nfs_have_delegated_atime(inode)) { - inode_update_timestamps(inode, S_ATIME); - NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ATIME; - } + if (nfs_have_delegated_atime(inode)) + nfs_update_timestamps(inode, ATTR_ATIME); spin_unlock(&inode->i_lock); } void nfs_update_delegated_mtime_locked(struct inode *inode) { - if (nfs_have_delegated_mtime(inode)) { - inode_update_timestamps(inode, S_CTIME | S_MTIME); - NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_CTIME | - NFS_INO_INVALID_MTIME); - } + if (nfs_have_delegated_mtime(inode)) + nfs_update_timestamps(inode, ATTR_MTIME); } void nfs_update_delegated_mtime(struct inode *inode) @@ -682,15 +697,16 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, attr->ia_valid &= ~ATTR_SIZE; } - if (nfs_have_delegated_mtime(inode)) { - if (attr->ia_valid & ATTR_MTIME) { - nfs_update_delegated_mtime(inode); - attr->ia_valid &= ~ATTR_MTIME; - } - if (attr->ia_valid & ATTR_ATIME) { - nfs_update_delegated_atime(inode); - attr->ia_valid &= ~ATTR_ATIME; - } + if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { + spin_lock(&inode->i_lock); + nfs_update_timestamps(inode, attr->ia_valid); + spin_unlock(&inode->i_lock); + attr->ia_valid &= ~(ATTR_MTIME | ATTR_ATIME); + } else if (nfs_have_delegated_atime(inode) && + attr->ia_valid & ATTR_ATIME && + 
!(attr->ia_valid & ATTR_MTIME)) { + nfs_update_delegated_atime(inode); + attr->ia_valid &= ~ATTR_ATIME; } /* Optimization: if the end result is no change, don't RPC */ @@ -1408,6 +1424,13 @@ int nfs_clear_invalid_mapping(struct address_space *mapping) TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (ret) goto out; + smp_rmb(); /* pairs with smp_wmb() below */ + if (test_bit(NFS_INO_INVALIDATING, bitlock)) + continue; + /* pairs with nfs_set_cache_invalid()'s smp_store_release() */ + if (!(smp_load_acquire(&nfsi->cache_validity) & NFS_INO_INVALID_DATA)) + goto out; + /* Slow-path that double-checks with spinlock held */ spin_lock(&inode->i_lock); if (test_bit(NFS_INO_INVALIDATING, bitlock)) { spin_unlock(&inode->i_lock); @@ -1633,6 +1656,7 @@ void nfs_fattr_init(struct nfs_fattr *fattr) fattr->gencount = nfs_inc_attr_generation_counter(); fattr->owner_name = NULL; fattr->group_name = NULL; + fattr->mdsthreshold = NULL; } EXPORT_SYMBOL_GPL(nfs_fattr_init); diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index d0aa680ec816..8f0ce82a677e 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -205,7 +205,8 @@ void nfs_local_probe(struct nfs_client *clp) nfs_local_disable(clp); } - nfs_uuid_begin(&clp->cl_uuid); + if (!nfs_uuid_begin(&clp->cl_uuid)) + return; if (nfs_server_uuid_is_local(clp)) nfs_local_enable(clp); nfs_uuid_end(&clp->cl_uuid); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cd2fbde2e6d7..9d40319e063d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3452,6 +3452,10 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred, adjust_flags |= NFS_INO_INVALID_MODE; if (sattr->ia_valid & (ATTR_UID | ATTR_GID)) adjust_flags |= NFS_INO_INVALID_OTHER; + if (sattr->ia_valid & ATTR_ATIME) + adjust_flags |= NFS_INO_INVALID_ATIME; + if (sattr->ia_valid & ATTR_MTIME) + adjust_flags |= NFS_INO_INVALID_MTIME; do { nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label), diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 
9723b6c53397..ae5c5e39afa0 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -885,7 +885,15 @@ static int nfs_request_mount(struct fs_context *fc, * Now ask the mount server to map our export path * to a file handle. */ - status = nfs_mount(&request, ctx->timeo, ctx->retrans); + if ((request.protocol == XPRT_TRANSPORT_UDP) == + !(ctx->flags & NFS_MOUNT_TCP)) + /* + * NFS protocol and mount protocol are both UDP or neither UDP + * so timeouts are compatible. Use NFS timeouts for MOUNT + */ + status = nfs_mount(&request, ctx->timeo, ctx->retrans); + else + status = nfs_mount(&request, NFS_UNSPEC_TIMEO, NFS_UNSPEC_RETRANS); if (status != 0) { dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", request.hostname, status); diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index 5c8ce5066c16..09404d142d1a 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -5,7 +5,7 @@ */ #include <linux/module.h> -#include <linux/rculist.h> +#include <linux/list.h> #include <linux/nfslocalio.h> #include <net/netns/generic.h> @@ -20,15 +20,27 @@ static DEFINE_SPINLOCK(nfs_uuid_lock); */ static LIST_HEAD(nfs_uuids); -void nfs_uuid_begin(nfs_uuid_t *nfs_uuid) +void nfs_uuid_init(nfs_uuid_t *nfs_uuid) { nfs_uuid->net = NULL; nfs_uuid->dom = NULL; - uuid_gen(&nfs_uuid->uuid); + INIT_LIST_HEAD(&nfs_uuid->list); +} +EXPORT_SYMBOL_GPL(nfs_uuid_init); +bool nfs_uuid_begin(nfs_uuid_t *nfs_uuid) +{ spin_lock(&nfs_uuid_lock); - list_add_tail_rcu(&nfs_uuid->list, &nfs_uuids); + /* Is this nfs_uuid already in use? 
*/ + if (!list_empty(&nfs_uuid->list)) { + spin_unlock(&nfs_uuid_lock); + return false; + } + uuid_gen(&nfs_uuid->uuid); + list_add_tail(&nfs_uuid->list, &nfs_uuids); spin_unlock(&nfs_uuid_lock); + + return true; } EXPORT_SYMBOL_GPL(nfs_uuid_begin); @@ -36,7 +48,8 @@ void nfs_uuid_end(nfs_uuid_t *nfs_uuid) { if (nfs_uuid->net == NULL) { spin_lock(&nfs_uuid_lock); - list_del_init(&nfs_uuid->list); + if (nfs_uuid->net == NULL) + list_del_init(&nfs_uuid->list); spin_unlock(&nfs_uuid_lock); } } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b5a6bf4f459f..d32f2dfd148f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1841,14 +1841,12 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!async_copy) goto out_err; async_copy->cp_nn = nn; + INIT_LIST_HEAD(&async_copy->copies); + refcount_set(&async_copy->refcount, 1); /* Arbitrary cap on number of pending async copy operations */ if (atomic_inc_return(&nn->pending_async_copies) > - (int)rqstp->rq_pool->sp_nrthreads) { - atomic_dec(&nn->pending_async_copies); + (int)rqstp->rq_pool->sp_nrthreads) goto out_err; - } - INIT_LIST_HEAD(&async_copy->copies); - refcount_set(&async_copy->refcount, 1); async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL); if (!async_copy->cp_src) goto out_err; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 56b261608af4..d80406f8b568 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1359,21 +1359,47 @@ static void destroy_delegation(struct nfs4_delegation *dp) destroy_unhashed_deleg(dp); } +/** + * revoke_delegation - perform nfs4 delegation structure cleanup + * @dp: pointer to the delegation + * + * This function assumes that it's called either from the administrative + * interface (nfsd4_revoke_states()) that's revoking a specific delegation + * stateid or it's called from a laundromat thread (nfsd4_landromat()) that + * determined that this specific state has expired and needs to be revoked + * (both mark 
state with the appropriate stid sc_status mode). It is also + * assumed that a reference was taken on the @dp state. + * + * If this function finds that the @dp state is SC_STATUS_FREED it means + * that a FREE_STATEID operation for this stateid has been processed and + * we can proceed to removing it from recalled list. However, if @dp state + * isn't marked SC_STATUS_FREED, it means we need place it on the cl_revoked + * list and wait for the FREE_STATEID to arrive from the client. At the same + * time, we need to mark it as SC_STATUS_FREEABLE to indicate to the + * nfsd4_free_stateid() function that this stateid has already been added + * to the cl_revoked list and that nfsd4_free_stateid() is now responsible + * for removing it from the list. Inspection of where the delegation state + * in the revocation process is protected by the clp->cl_lock. + */ static void revoke_delegation(struct nfs4_delegation *dp) { struct nfs4_client *clp = dp->dl_stid.sc_client; WARN_ON(!list_empty(&dp->dl_recall_lru)); + WARN_ON_ONCE(!(dp->dl_stid.sc_status & + (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED))); trace_nfsd_stid_revoke(&dp->dl_stid); - if (dp->dl_stid.sc_status & - (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) { - spin_lock(&clp->cl_lock); - refcount_inc(&dp->dl_stid.sc_count); - list_add(&dp->dl_recall_lru, &clp->cl_revoked); - spin_unlock(&clp->cl_lock); + spin_lock(&clp->cl_lock); + if (dp->dl_stid.sc_status & SC_STATUS_FREED) { + list_del_init(&dp->dl_recall_lru); + goto out; } + list_add(&dp->dl_recall_lru, &clp->cl_revoked); + dp->dl_stid.sc_status |= SC_STATUS_FREEABLE; +out: + spin_unlock(&clp->cl_lock); destroy_unhashed_deleg(dp); } @@ -1780,6 +1806,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) mutex_unlock(&stp->st_mutex); break; case SC_TYPE_DELEG: + refcount_inc(&stid->sc_count); dp = delegstateid(stid); spin_lock(&state_lock); if (!unhash_delegation_locked( @@ -6545,6 +6572,7 @@ nfs4_laundromat(struct nfsd_net *nn) dp = list_entry 
(pos, struct nfs4_delegation, dl_recall_lru); if (!state_expired(<, dp->dl_time)) break; + refcount_inc(&dp->dl_stid.sc_count); unhash_delegation_locked(dp, SC_STATUS_REVOKED); list_add(&dp->dl_recall_lru, &reaplist); } @@ -7157,7 +7185,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, s->sc_status |= SC_STATUS_CLOSED; spin_unlock(&s->sc_lock); dp = delegstateid(s); - list_del_init(&dp->dl_recall_lru); + if (s->sc_status & SC_STATUS_FREEABLE) + list_del_init(&dp->dl_recall_lru); + s->sc_status |= SC_STATUS_FREED; spin_unlock(&cl->cl_lock); nfs4_put_stid(s); ret = nfs_ok; @@ -7487,7 +7517,9 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; - status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn); + status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, + SC_STATUS_REVOKED | SC_STATUS_FREEABLE, + &s, nn); if (status) goto out; dp = delegstateid(s); @@ -7969,9 +8001,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate) || - exportfs_lock_op_is_async(sb->s_export_op)) - flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: spin_lock(&fp->fi_lock); @@ -7982,9 +8011,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate) || - exportfs_lock_op_is_async(sb->s_export_op)) - flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); @@ -8004,15 +8030,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - /* - * Most filesystems with their own ->lock operations will block - * the nfsd thread waiting to acquire the lock. 
That leads to - * deadlocks (we don't want every nfsd thread tied up waiting - * for file locks), so don't attempt blocking lock notifications - * on those filesystems: - */ - if (!exportfs_lock_op_is_async(sb->s_export_op)) - flags &= ~FL_SLEEP; + if (lock->lk_type & (NFS4_READW_LT | NFS4_WRITEW_LT) && + nfsd4_has_session(cstate) && + locks_can_async_lock(nf->nf_file->f_op)) + flags |= FL_SLEEP; nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); if (!nbl) { @@ -8684,7 +8705,7 @@ nfs4_state_shutdown_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); shrinker_free(nn->nfsd_client_shrinker); - cancel_work(&nn->nfsd_shrinker_work); + cancel_work_sync(&nn->nfsd_shrinker_work); cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 79c743c01a47..35b3564c065f 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -114,6 +114,8 @@ struct nfs4_stid { /* For a deleg stateid kept around only to process free_stateid's: */ #define SC_STATUS_REVOKED BIT(1) #define SC_STATUS_ADMIN_REVOKED BIT(2) +#define SC_STATUS_FREEABLE BIT(3) +#define SC_STATUS_FREED BIT(4) unsigned short sc_status; struct list_head sc_cp_list; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 22325b590e17..d6d4f2a0e898 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -903,11 +903,6 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, goto out; } - if (may_flags & NFSD_MAY_64BIT_COOKIE) - file->f_mode |= FMODE_64BITHASH; - else - file->f_mode |= FMODE_32BITHASH; - *filp = file; out: return host_err; @@ -2174,13 +2169,15 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, loff_t offset = *offsetp; int may_flags = NFSD_MAY_READ; - if (fhp->fh_64bit_cookies) - may_flags |= NFSD_MAY_64BIT_COOKIE; - err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file); if (err) goto out; + if (fhp->fh_64bit_cookies) + file->f_mode |= FMODE_64BITHASH; + else + 
file->f_mode |= FMODE_32BITHASH; + offset = vfs_llseek(file, offset, SEEK_SET); if (offset < 0) { err = nfserrno((int)offset); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 57b4af5ad646..501ad7be5174 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -68,7 +68,6 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) goto failed; } memset(bh->b_data, 0, i_blocksize(inode)); - bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); set_buffer_uptodate(bh); @@ -133,7 +132,6 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, goto found; } set_buffer_mapped(bh); - bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index fe5b1a30c509..a8602729586a 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -289,7 +289,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) * The folio is mapped and unlocked. When the caller is finished with * the entry, it should call folio_release_kmap(). * - * On failure, returns NULL and the caller should ignore foliop. + * On failure, returns an error pointer and the caller should ignore foliop. 
*/ struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir, const struct qstr *qstr, struct folio **foliop) @@ -312,22 +312,24 @@ struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir, do { char *kaddr = nilfs_get_folio(dir, n, foliop); - if (!IS_ERR(kaddr)) { - de = (struct nilfs_dir_entry *)kaddr; - kaddr += nilfs_last_byte(dir, n) - reclen; - while ((char *) de <= kaddr) { - if (de->rec_len == 0) { - nilfs_error(dir->i_sb, - "zero-length directory entry"); - folio_release_kmap(*foliop, kaddr); - goto out; - } - if (nilfs_match(namelen, name, de)) - goto found; - de = nilfs_next_entry(de); + if (IS_ERR(kaddr)) + return ERR_CAST(kaddr); + + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(dir, n) - reclen; + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, + "zero-length directory entry"); + folio_release_kmap(*foliop, kaddr); + goto out; } - folio_release_kmap(*foliop, kaddr); + if (nilfs_match(namelen, name, de)) + goto found; + de = nilfs_next_entry(de); } + folio_release_kmap(*foliop, kaddr); + if (++n >= npages) n = 0; /* next folio is past the blocks we've got */ @@ -340,7 +342,7 @@ struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir, } } while (n != start); out: - return NULL; + return ERR_PTR(-ENOENT); found: ei->i_dir_start_lookup = n; @@ -384,18 +386,18 @@ fail: return NULL; } -ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) +int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino) { - ino_t res = 0; struct nilfs_dir_entry *de; struct folio *folio; de = nilfs_find_entry(dir, qstr, &folio); - if (de) { - res = le64_to_cpu(de->inode); - folio_release_kmap(folio, de); - } - return res; + if (IS_ERR(de)) + return PTR_ERR(de); + + *ino = le64_to_cpu(de->inode); + folio_release_kmap(folio, de); + return 0; } void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 
1c9ae36a03ab..ace22253fed0 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -83,10 +83,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, goto out; } - if (!buffer_mapped(bh)) { - bh->b_bdev = inode->i_sb->s_bdev; + if (!buffer_mapped(bh)) set_buffer_mapped(bh); - } bh->b_blocknr = pbn; bh->b_end_io = end_buffer_read_sync; get_bh(bh); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index ceb7dc0b5bad..2db6350b5ac2 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -89,7 +89,6 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, if (buffer_uptodate(bh)) goto failed_bh; - bh->b_bdev = sb->s_bdev; err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); if (likely(!err)) { get_bh(bh); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index c950139db6ef..9b108052d9f7 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -55,12 +55,20 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; ino_t ino; + int res; if (dentry->d_name.len > NILFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ino = nilfs_inode_by_name(dir, &dentry->d_name); - inode = ino ? 
nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino) : NULL; + res = nilfs_inode_by_name(dir, &dentry->d_name, &ino); + if (res) { + if (res != -ENOENT) + return ERR_PTR(res); + inode = NULL; + } else { + inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino); + } + return d_splice_alias(inode, dentry); } @@ -149,6 +157,9 @@ static int nilfs_symlink(struct mnt_idmap *idmap, struct inode *dir, /* slow symlink */ inode->i_op = &nilfs_symlink_inode_operations; inode_nohighmem(inode); + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_constraint(inode->i_mapping, + ~__GFP_FS)); inode->i_mapping->a_ops = &nilfs_aops; err = page_symlink(inode, symname, l); if (err) @@ -263,10 +274,11 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) struct folio *folio; int err; - err = -ENOENT; de = nilfs_find_entry(dir, &dentry->d_name, &folio); - if (!de) + if (IS_ERR(de)) { + err = PTR_ERR(de); goto out; + } inode = d_inode(dentry); err = -EIO; @@ -362,10 +374,11 @@ static int nilfs_rename(struct mnt_idmap *idmap, if (unlikely(err)) return err; - err = -ENOENT; old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_folio); - if (!old_de) + if (IS_ERR(old_de)) { + err = PTR_ERR(old_de); goto out; + } if (S_ISDIR(old_inode->i_mode)) { err = -EIO; @@ -382,10 +395,12 @@ static int nilfs_rename(struct mnt_idmap *idmap, if (dir_de && !nilfs_empty_dir(new_inode)) goto out_dir; - err = -ENOENT; - new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_folio); - if (!new_de) + new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, + &new_folio); + if (IS_ERR(new_de)) { + err = PTR_ERR(new_de); goto out_dir; + } nilfs_set_link(new_dir, new_de, new_folio, old_inode); folio_release_kmap(new_folio, new_de); nilfs_mark_inode_dirty(new_dir); @@ -440,12 +455,13 @@ out: */ static struct dentry *nilfs_get_parent(struct dentry *child) { - unsigned long ino; + ino_t ino; + int res; struct nilfs_root *root; - ino = nilfs_inode_by_name(d_inode(child), &dotdot_name); - 
if (!ino) - return ERR_PTR(-ENOENT); + res = nilfs_inode_by_name(d_inode(child), &dotdot_name, &ino); + if (res) + return ERR_PTR(res); root = NILFS_I(d_inode(child))->i_root; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index fb1c4c5bae7c..45d03826eaf1 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -254,7 +254,7 @@ static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags) /* dir.c */ int nilfs_add_link(struct dentry *, struct inode *); -ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); +int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino); int nilfs_make_empty(struct inode *, struct inode *); struct nilfs_dir_entry *nilfs_find_entry(struct inode *, const struct qstr *, struct folio **); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 9c0b7cddeaae..9a849397c768 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -39,7 +39,6 @@ static struct buffer_head *__nilfs_get_folio_block(struct folio *folio, first_block = (unsigned long)index << (PAGE_SHIFT - blkbits); bh = get_nth_bh(bh, block - first_block); - touch_buffer(bh); wait_on_buffer(bh); return bh; } @@ -64,6 +63,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, folio_put(folio); return NULL; } + bh->b_bdev = inode->i_sb->s_bdev; return bh; } @@ -77,7 +77,8 @@ void nilfs_forget_buffer(struct buffer_head *bh) const unsigned long clear_bits = (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) | BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | - BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected)); + BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected) | + BIT(BH_Delay)); lock_buffer(bh); set_mask_bits(&bh->b_state, clear_bits, 0); @@ -98,16 +99,16 @@ void nilfs_forget_buffer(struct buffer_head *bh) */ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) { - void *kaddr0, *kaddr1; + void *saddr, *daddr; unsigned long bits; - struct page *spage = sbh->b_page, *dpage = dbh->b_page; + struct folio *sfolio = sbh->b_folio, 
*dfolio = dbh->b_folio; struct buffer_head *bh; - kaddr0 = kmap_local_page(spage); - kaddr1 = kmap_local_page(dpage); - memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size); - kunmap_local(kaddr1); - kunmap_local(kaddr0); + saddr = kmap_local_folio(sfolio, bh_offset(sbh)); + daddr = kmap_local_folio(dfolio, bh_offset(dbh)); + memcpy(daddr, saddr, sbh->b_size); + kunmap_local(daddr); + kunmap_local(saddr); dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS; dbh->b_blocknr = sbh->b_blocknr; @@ -121,13 +122,13 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) unlock_buffer(bh); } if (bits & BIT(BH_Uptodate)) - SetPageUptodate(dpage); + folio_mark_uptodate(dfolio); else - ClearPageUptodate(dpage); + folio_clear_uptodate(dfolio); if (bits & BIT(BH_Mapped)) - SetPageMappedToDisk(dpage); + folio_set_mappedtodisk(dfolio); else - ClearPageMappedToDisk(dpage); + folio_clear_mappedtodisk(dfolio); } /** @@ -400,13 +401,15 @@ void nilfs_clear_folio_dirty(struct folio *folio) folio_clear_uptodate(folio); folio_clear_mappedtodisk(folio); + folio_clear_checked(folio); head = folio_buffers(folio); if (head) { const unsigned long clear_bits = (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) | BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | - BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected)); + BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected) | + BIT(BH_Delay)); bh = head; do { diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index d5dbef7f5c95..6004dfdfdf0f 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -16,7 +16,6 @@ #include <linux/security.h> #include <linux/spinlock.h> #include <linux/slab.h> -#include <linux/fdtable.h> #include <linux/fsnotify_backend.h> static int dir_notify_enable __read_mostly = 1; @@ -347,9 +346,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) new_fsn_mark = NULL; } - rcu_read_lock(); - f = lookup_fdget_rcu(fd); - rcu_read_unlock(); + 
f = fget_raw(fd); /* if (f != filp) means that we lost a race and another task/thread * actually closed the fd we are still playing with before we grabbed diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 224bccaab4cc..24c7c5df4998 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/fanotify.h> -#include <linux/fdtable.h> #include <linux/fsnotify_backend.h> #include <linux/init.h> #include <linux/jiffies.h> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9644bc72e457..35159fa0b063 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/fanotify.h> #include <linux/fcntl.h> -#include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/anon_inodes.h> @@ -1003,22 +1002,17 @@ static int fanotify_find_path(int dfd, const char __user *filename, dfd, filename, flags); if (filename == NULL) { - struct fd f = fdget(dfd); + CLASS(fd, f)(dfd); - ret = -EBADF; - if (!fd_file(f)) - goto out; + if (fd_empty(f)) + return -EBADF; - ret = -ENOTDIR; if ((flags & FAN_MARK_ONLYDIR) && - !(S_ISDIR(file_inode(fd_file(f))->i_mode))) { - fdput(f); - goto out; - } + !(S_ISDIR(file_inode(fd_file(f))->i_mode))) + return -ENOTDIR; *path = fd_file(f)->f_path; path_get(path); - fdput(f); } else { unsigned int lookup_flags = 0; @@ -1682,7 +1676,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, struct inode *inode = NULL; struct vfsmount *mnt = NULL; struct fsnotify_group *group; - struct fd f; struct path path; struct fan_fsid __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; @@ -1752,14 +1745,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, umask = FANOTIFY_EVENT_FLAGS; } - f = fdget(fanotify_fd); - if 
(unlikely(!fd_file(f))) + CLASS(fd, f)(fanotify_fd); + if (fd_empty(f)) return -EBADF; /* verify that this is indeed an fanotify instance */ - ret = -EINVAL; if (unlikely(fd_file(f)->f_op != &fanotify_fops)) - goto fput_and_out; + return -EINVAL; group = fd_file(f)->private_data; /* @@ -1767,23 +1759,21 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * marks. This also includes setting up such marks by a group that * was initialized by an unprivileged user. */ - ret = -EPERM; if ((!capable(CAP_SYS_ADMIN) || FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) && mark_type != FAN_MARK_INODE) - goto fput_and_out; + return -EPERM; /* * Permission events require minimum priority FAN_CLASS_CONTENT. */ - ret = -EINVAL; if (mask & FANOTIFY_PERM_EVENTS && group->priority < FSNOTIFY_PRIO_CONTENT) - goto fput_and_out; + return -EINVAL; if (mask & FAN_FS_ERROR && mark_type != FAN_MARK_FILESYSTEM) - goto fput_and_out; + return -EINVAL; /* * Evictable is only relevant for inode marks, because only inode object @@ -1791,7 +1781,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, */ if (flags & FAN_MARK_EVICTABLE && mark_type != FAN_MARK_INODE) - goto fput_and_out; + return -EINVAL; /* * Events that do not carry enough information to report @@ -1803,7 +1793,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) && (!fid_mode || mark_type == FAN_MARK_MOUNT)) - goto fput_and_out; + return -EINVAL; /* * FAN_RENAME uses special info type records to report the old and @@ -1811,23 +1801,22 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * useful and was not implemented. 
*/ if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) - goto fput_and_out; + return -EINVAL; if (mark_cmd == FAN_MARK_FLUSH) { - ret = 0; if (mark_type == FAN_MARK_MOUNT) fsnotify_clear_vfsmount_marks_by_group(group); else if (mark_type == FAN_MARK_FILESYSTEM) fsnotify_clear_sb_marks_by_group(group); else fsnotify_clear_inode_marks_by_group(group); - goto fput_and_out; + return 0; } ret = fanotify_find_path(dfd, pathname, &path, flags, (mask & ALL_FSNOTIFY_EVENTS), obj_type); if (ret) - goto fput_and_out; + return ret; if (mark_cmd == FAN_MARK_ADD) { ret = fanotify_events_supported(group, &path, mask, flags); @@ -1906,8 +1895,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, path_put_and_out: path_put(&path); -fput_and_out: - fdput(f); return ret; } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 0794dcaf1e47..e0c48956608a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -732,7 +732,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, struct fsnotify_group *group; struct inode *inode; struct path path; - struct fd f; int ret; unsigned flags = 0; @@ -752,21 +751,17 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, if (unlikely(!(mask & ALL_INOTIFY_BITS))) return -EINVAL; - f = fdget(fd); - if (unlikely(!fd_file(f))) + CLASS(fd, f)(fd); + if (fd_empty(f)) return -EBADF; /* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */ - if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) { - ret = -EINVAL; - goto fput_and_out; - } + if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) + return -EINVAL; /* verify that this is indeed an inotify instance */ - if (unlikely(fd_file(f)->f_op != &inotify_fops)) { - ret = -EINVAL; - goto fput_and_out; - } + if (unlikely(fd_file(f)->f_op != &inotify_fops)) + return -EINVAL; if (!(mask & IN_DONT_FOLLOW)) flags |= LOOKUP_FOLLOW; @@ -776,7 
+771,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, ret = inotify_find_inode(pathname, &path, flags, (mask & IN_ALL_EVENTS)); if (ret) - goto fput_and_out; + return ret; /* inode held in place by reference to path; group by fget on fd */ inode = path.dentry->d_inode; @@ -785,8 +780,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, /* create/update an inode mark */ ret = inotify_update_watch(group, inode, mask); path_put(&path); -fput_and_out: - fdput(f); return ret; } @@ -794,33 +787,26 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) { struct fsnotify_group *group; struct inotify_inode_mark *i_mark; - struct fd f; - int ret = -EINVAL; + CLASS(fd, f)(fd); - f = fdget(fd); - if (unlikely(!fd_file(f))) + if (fd_empty(f)) return -EBADF; /* verify that this is indeed an inotify instance */ if (unlikely(fd_file(f)->f_op != &inotify_fops)) - goto out; + return -EINVAL; group = fd_file(f)->private_data; i_mark = inotify_idr_find(group, wd); if (unlikely(!i_mark)) - goto out; - - ret = 0; + return -EINVAL; fsnotify_destroy_mark(&i_mark->fsn_mark, group); /* match ref taken by inotify_idr_find */ fsnotify_put_mark(&i_mark->fsn_mark); - -out: - fdput(f); - return ret; + return 0; } /* diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4b9f45d7049e..4200a0341343 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1765,42 +1765,41 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, long fd; int sectsize; char *p = (char *)page; - struct fd f; ssize_t ret = -EINVAL; int live_threshold; if (reg->hr_bdev_file) - goto out; + return -EINVAL; /* We can't heartbeat without having had our node number * configured yet. 
*/ if (o2nm_this_node() == O2NM_MAX_NODES) - goto out; + return -EINVAL; fd = simple_strtol(p, &p, 0); if (!p || (*p && (*p != '\n'))) - goto out; + return -EINVAL; if (fd < 0 || fd >= INT_MAX) - goto out; + return -EINVAL; - f = fdget(fd); - if (fd_file(f) == NULL) - goto out; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EINVAL; if (reg->hr_blocks == 0 || reg->hr_start_block == 0 || reg->hr_block_bytes == 0) - goto out2; + return -EINVAL; if (!S_ISBLK(fd_file(f)->f_mapping->host->i_mode)) - goto out2; + return -EINVAL; reg->hr_bdev_file = bdev_file_open_by_dev(fd_file(f)->f_mapping->host->i_rdev, BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL); if (IS_ERR(reg->hr_bdev_file)) { ret = PTR_ERR(reg->hr_bdev_file); reg->hr_bdev_file = NULL; - goto out2; + return ret; } sectsize = bdev_logical_block_size(reg_bdev(reg)); @@ -1906,9 +1905,6 @@ out3: fput(reg->hr_bdev_file); reg->hr_bdev_file = NULL; } -out2: - fdput(f); -out: return ret; } diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 96b684763b39..b95724b767e1 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -280,5 +280,4 @@ const struct export_operations ocfs2_export_ops = { .fh_to_dentry = ocfs2_fh_to_dentry, .fh_to_parent = ocfs2_fh_to_parent, .get_parent = ocfs2_get_parent, - .flags = EXPORT_OP_ASYNC_LOCK, }; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ad131a2fc58e..4fa6c840d20b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1129,9 +1129,12 @@ int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, trace_ocfs2_setattr(inode, dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, dentry->d_name.len, dentry->d_name.name, - attr->ia_valid, attr->ia_mode, - from_kuid(&init_user_ns, attr->ia_uid), - from_kgid(&init_user_ns, attr->ia_gid)); + attr->ia_valid, + attr->ia_valid & ATTR_MODE ? attr->ia_mode : 0, + attr->ia_valid & ATTR_UID ? + from_kuid(&init_user_ns, attr->ia_uid) : 0, + attr->ia_valid & ATTR_GID ? 
+ from_kgid(&init_user_ns, attr->ia_gid) : 0); /* ensuring we don't even attempt to truncate a symlink */ if (S_ISLNK(inode->i_mode)) @@ -1784,6 +1787,14 @@ int ocfs2_remove_inode_range(struct inode *inode, return 0; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di); + + if (byte_start > id_count || byte_start + byte_len > id_count) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + ret = ocfs2_truncate_inline(inode, di_bh, byte_start, byte_start + byte_len, 0); if (ret) { @@ -2801,6 +2812,7 @@ const struct file_operations ocfs2_fops = { .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, .remap_file_range = ocfs2_remap_file_range, + .fop_flags = FOP_ASYNC_LOCK, }; WRAP_DIR_ITER(ocfs2_readdir) // FIXME! @@ -2817,6 +2829,7 @@ const struct file_operations ocfs2_dops = { #endif .lock = ocfs2_lock, .flock = ocfs2_flock, + .fop_flags = FOP_ASYNC_LOCK, }; /* diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index c4a4016d3866..b0733c08ed13 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -574,6 +574,8 @@ out_commit: ocfs2_commit_trans(osb, handle); out_free_group_bh: + if (ret < 0) + ocfs2_remove_from_cache(INODE_CACHE(inode), group_bh); brelse(group_bh); out_unlock: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 3d404624bb96..c79b4291777f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2319,6 +2319,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, struct ocfs2_blockcheck_stats *stats) { int status = -EAGAIN; + u32 blksz_bits; if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { @@ -2333,11 +2334,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, goto out; } status = -EINVAL; - if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { + /* Acceptable block sizes are 512 bytes, 1K, 2K and 4K. 
*/ + blksz_bits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); + if (blksz_bits < 9 || blksz_bits > 12) { mlog(ML_ERROR, "found superblock with incorrect block " - "size: found %u, should be %u\n", - 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), - blksz); + "size bits: found %u, should be 9, 10, 11, or 12\n", + blksz_bits); + } else if ((1 << le32_to_cpu(blksz_bits)) != blksz) { + mlog(ML_ERROR, "found superblock with incorrect block " + "size: found %u, should be %u\n", 1 << blksz_bits, blksz); } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != OCFS2_MAJOR_REV_LEVEL || le16_to_cpu(di->id2.i_super.s_minor_rev_level) != diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index dd0a05365e79..73a6f6fd8a8e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2036,8 +2036,7 @@ static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc, rc = 0; ocfs2_xa_cleanup_value_truncate(loc, "removing", orig_clusters); - if (rc) - goto out; + goto out; } } diff --git a/fs/open.c b/fs/open.c index acaeb3e25c88..4b37c5912e6c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -187,19 +187,13 @@ long do_ftruncate(struct file *file, loff_t length, int small) long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { - struct fd f; - int error; - if (length < 0) return -EINVAL; - f = fdget(fd); - if (!fd_file(f)) + CLASS(fd, f)(fd); + if (fd_empty(f)) return -EBADF; - error = do_ftruncate(fd_file(f), length, small); - - fdput(f); - return error; + return do_ftruncate(fd_file(f), length, small); } SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length) @@ -349,14 +343,12 @@ EXPORT_SYMBOL_GPL(vfs_fallocate); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { - struct fd f = fdget(fd); - int error = -EBADF; + CLASS(fd, f)(fd); - if (fd_file(f)) { - error = vfs_fallocate(fd_file(f), mode, offset, len); - fdput(f); - } - return error; + if (fd_empty(f)) + return -EBADF; + + return vfs_fallocate(fd_file(f), mode, offset, len); } SYSCALL_DEFINE4(fallocate, int, fd, 
int, mode, loff_t, offset, loff_t, len) @@ -580,23 +572,18 @@ out: SYSCALL_DEFINE1(fchdir, unsigned int, fd) { - struct fd f = fdget_raw(fd); + CLASS(fd_raw, f)(fd); int error; - error = -EBADF; - if (!fd_file(f)) - goto out; + if (fd_empty(f)) + return -EBADF; - error = -ENOTDIR; if (!d_can_lookup(fd_file(f)->f_path.dentry)) - goto out_putf; + return -ENOTDIR; error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &fd_file(f)->f_path); -out_putf: - fdput(f); -out: return error; } @@ -671,14 +658,12 @@ int vfs_fchmod(struct file *file, umode_t mode) SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { - struct fd f = fdget(fd); - int err = -EBADF; + CLASS(fd, f)(fd); - if (fd_file(f)) { - err = vfs_fchmod(fd_file(f), mode); - fdput(f); - } - return err; + if (fd_empty(f)) + return -EBADF; + + return vfs_fchmod(fd_file(f), mode); } static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, @@ -865,14 +850,12 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group) int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { - struct fd f = fdget(fd); - int error = -EBADF; + CLASS(fd, f)(fd); - if (fd_file(f)) { - error = vfs_fchown(fd_file(f), user, group); - fdput(f); - } - return error; + if (fd_empty(f)) + return -EBADF; + + return vfs_fchown(fd_file(f), user, group); } SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) @@ -1457,6 +1440,8 @@ SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, if (unlikely(usize < OPEN_HOW_SIZE_VER0)) return -EINVAL; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); if (err) @@ -1574,23 +1559,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd) return retval; } -/** - * sys_close_range() - Close all file descriptors in a given range. 
- * - * @fd: starting file descriptor to close - * @max_fd: last file descriptor to close - * @flags: reserved for future extensions - * - * This closes a range of file descriptors. All file descriptors - * from @fd up to and including @max_fd are closed. - * Currently, errors to close a given file descriptor are ignored. - */ -SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, - unsigned int, flags) -{ - return __close_range(fd, max_fd, flags); -} - /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 2ed6ad641a20..ee2cbd044ce6 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -16,7 +16,6 @@ #include <linux/sched/signal.h> #include <linux/cred.h> #include <linux/namei.h> -#include <linux/fdtable.h> #include <linux/ratelimit.h> #include <linux/exportfs.h> #include "overlayfs.h" diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 4504493b20be..4444c78e2e0c 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -231,6 +231,11 @@ static void ovl_file_modified(struct file *file) ovl_copyattr(file_inode(file)); } +static void ovl_file_end_write(struct file *file, loff_t pos, ssize_t ret) +{ + ovl_file_modified(file); +} + static void ovl_file_accessed(struct file *file) { struct inode *inode, *upperinode; @@ -294,7 +299,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) struct backing_file_ctx ctx = { .cred = ovl_creds(inode->i_sb), .user_file = file, - .end_write = ovl_file_modified, + .end_write = ovl_file_end_write, }; if (!iov_iter_count(iter)) @@ -364,7 +369,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, struct backing_file_ctx ctx = { .cred = ovl_creds(inode->i_sb), .user_file = out, - .end_write = ovl_file_modified, + .end_write = ovl_file_end_write, }; inode_lock(inode); diff --git a/fs/overlayfs/inode.c 
b/fs/overlayfs/inode.c index 35fd3e3e1778..8b31f44c12cd 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -170,7 +170,7 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, type = ovl_path_real(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - err = ovl_do_getattr(&realpath, stat, request_mask, flags); + err = vfs_getattr_nosec(&realpath, stat, request_mask, flags); if (err) goto out; @@ -195,8 +195,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, (!is_dir ? STATX_NLINK : 0); ovl_path_lower(dentry, &realpath); - err = ovl_do_getattr(&realpath, &lowerstat, lowermask, - flags); + err = vfs_getattr_nosec(&realpath, &lowerstat, lowermask, + flags); if (err) goto out; @@ -248,8 +248,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, ovl_path_lowerdata(dentry, &realpath); if (realpath.dentry) { - err = ovl_do_getattr(&realpath, &lowerdatastat, - lowermask, flags); + err = vfs_getattr_nosec(&realpath, &lowerdatastat, + lowermask, flags); if (err) goto out; } else { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 0bfe35da4b7b..910dbbb2bb7b 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -412,14 +412,6 @@ static inline bool ovl_open_flags_need_copy_up(int flags) return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)); } -static inline int ovl_do_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) -{ - if (flags & AT_GETATTR_NOSEC) - return vfs_getattr_nosec(path, stat, request_mask, flags); - return vfs_getattr(path, stat, request_mask, flags); -} - /* util.c */ int ovl_get_write_access(struct dentry *dentry); void ovl_put_write_access(struct dentry *dentry); diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index e42546c6c5df..1115c22deca0 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -141,10 +141,10 @@ static int ovl_verity_mode_def(void) const struct 
fs_parameter_spec ovl_parameter_spec[] = { fsparam_string_empty("lowerdir", Opt_lowerdir), - fsparam_string("lowerdir+", Opt_lowerdir_add), - fsparam_string("datadir+", Opt_datadir_add), - fsparam_string("upperdir", Opt_upperdir), - fsparam_string("workdir", Opt_workdir), + fsparam_file_or_string("lowerdir+", Opt_lowerdir_add), + fsparam_file_or_string("datadir+", Opt_datadir_add), + fsparam_file_or_string("upperdir", Opt_upperdir), + fsparam_file_or_string("workdir", Opt_workdir), fsparam_flag("default_permissions", Opt_default_permissions), fsparam_enum("redirect_dir", Opt_redirect_dir, ovl_parameter_redirect_dir), fsparam_enum("index", Opt_index, ovl_parameter_bool), @@ -367,40 +367,100 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, } } -static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum ovl_opt layer) +static inline bool is_upper_layer(enum ovl_opt layer) +{ + return layer == Opt_upperdir || layer == Opt_workdir; +} + +/* Handle non-file descriptor-based layer options that require path lookup. 
*/ +static inline int ovl_kern_path(const char *layer_name, struct path *layer_path, + enum ovl_opt layer) { - char *name = kstrdup(layer_name, GFP_KERNEL); - bool upper = (layer == Opt_upperdir || layer == Opt_workdir); - struct path path; int err; + switch (layer) { + case Opt_upperdir: + fallthrough; + case Opt_workdir: + fallthrough; + case Opt_lowerdir: + err = ovl_mount_dir(layer_name, layer_path); + break; + case Opt_lowerdir_add: + fallthrough; + case Opt_datadir_add: + err = ovl_mount_dir_noesc(layer_name, layer_path); + break; + default: + WARN_ON_ONCE(true); + err = -EINVAL; + } + + return err; +} + +static int ovl_do_parse_layer(struct fs_context *fc, const char *layer_name, + struct path *layer_path, enum ovl_opt layer) +{ + char *name __free(kfree) = kstrdup(layer_name, GFP_KERNEL); + bool upper; + int err = 0; + if (!name) return -ENOMEM; - if (upper || layer == Opt_lowerdir) - err = ovl_mount_dir(name, &path); - else - err = ovl_mount_dir_noesc(name, &path); + upper = is_upper_layer(layer); + err = ovl_mount_dir_check(fc, layer_path, layer, name, upper); if (err) - goto out_free; - - err = ovl_mount_dir_check(fc, &path, layer, name, upper); - if (err) - goto out_put; + return err; if (!upper) { err = ovl_ctx_realloc_lower(fc); if (err) - goto out_put; + return err; } /* Store the user provided path string in ctx to show in mountinfo */ - ovl_add_layer(fc, layer, &path, &name); + ovl_add_layer(fc, layer, layer_path, &name); + return err; +} + +static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param, + enum ovl_opt layer) +{ + struct path layer_path __free(path_put) = {}; + int err = 0; + + switch (param->type) { + case fs_value_is_string: + err = ovl_kern_path(param->string, &layer_path, layer); + if (err) + return err; + err = ovl_do_parse_layer(fc, param->string, &layer_path, layer); + break; + case fs_value_is_file: { + char *buf __free(kfree); + char *layer_name; + + buf = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); + if (!buf) + 
return -ENOMEM; + + layer_path = param->file->f_path; + path_get(&layer_path); + + layer_name = d_path(&layer_path, buf, PATH_MAX); + if (IS_ERR(layer_name)) + return PTR_ERR(layer_name); + + err = ovl_do_parse_layer(fc, layer_name, &layer_path, layer); + break; + } + default: + WARN_ON_ONCE(true); + err = -EINVAL; + } -out_put: - path_put(&path); -out_free: - kfree(name); return err; } @@ -474,7 +534,13 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) iter = dup; for (nr = 0; nr < nr_lower; nr++) { - err = ovl_parse_layer(fc, iter, Opt_lowerdir); + struct path path __free(path_put) = {}; + + err = ovl_kern_path(iter, &path, Opt_lowerdir); + if (err) + goto out_err; + + err = ovl_do_parse_layer(fc, iter, &path, Opt_lowerdir); if (err) goto out_err; @@ -555,7 +621,7 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_datadir_add: case Opt_upperdir: case Opt_workdir: - err = ovl_parse_layer(fc, param->string, opt); + err = ovl_parse_layer(fc, param, opt); break; case Opt_default_permissions: config->default_permissions = true; diff --git a/fs/pidfs.c b/fs/pidfs.c index 80675b6bf884..618abb1fa1b8 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -2,6 +2,7 @@ #include <linux/anon_inodes.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/cgroup.h> #include <linux/magic.h> #include <linux/mount.h> #include <linux/pid.h> @@ -114,6 +115,81 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) return poll_flags; } +static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg) +{ + struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; + size_t usize = _IOC_SIZE(cmd); + struct pidfd_info kinfo = {}; + struct user_namespace *user_ns; + const struct cred *c; + __u64 mask; +#ifdef CONFIG_CGROUPS + struct cgroup *cgrp; +#endif + + if (!uinfo) + return -EINVAL; + if (usize < PIDFD_INFO_SIZE_VER0) + return -EINVAL; /* First version, no smaller 
struct possible */ + + if (copy_from_user(&mask, &uinfo->mask, sizeof(mask))) + return -EFAULT; + + c = get_task_cred(task); + if (!c) + return -ESRCH; + + /* Unconditionally return identifiers and credentials, the rest only on request */ + + user_ns = current_user_ns(); + kinfo.ruid = from_kuid_munged(user_ns, c->uid); + kinfo.rgid = from_kgid_munged(user_ns, c->gid); + kinfo.euid = from_kuid_munged(user_ns, c->euid); + kinfo.egid = from_kgid_munged(user_ns, c->egid); + kinfo.suid = from_kuid_munged(user_ns, c->suid); + kinfo.sgid = from_kgid_munged(user_ns, c->sgid); + kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid); + kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid); + kinfo.mask |= PIDFD_INFO_CREDS; + put_cred(c); + +#ifdef CONFIG_CGROUPS + rcu_read_lock(); + cgrp = task_dfl_cgroup(task); + kinfo.cgroupid = cgroup_id(cgrp); + kinfo.mask |= PIDFD_INFO_CGROUPID; + rcu_read_unlock(); +#endif + + /* + * Copy pid/tgid last, to reduce the chances the information might be + * stale. Note that it is not possible to ensure it will be valid as the + * task might return as soon as the copy_to_user finishes, but that's ok + * and userspace expects that might happen and can act accordingly, so + * this is just best-effort. What we can do however is checking that all + * the fields are set correctly, or return ESRCH to avoid providing + * incomplete information. */ + + kinfo.ppid = task_ppid_nr_ns(task, NULL); + kinfo.tgid = task_tgid_vnr(task); + kinfo.pid = task_pid_vnr(task); + kinfo.mask |= PIDFD_INFO_PID; + + if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1)) + return -ESRCH; + + /* + * If userspace and the kernel have the same struct size it can just + * be copied. If userspace provides an older struct, only the bits that + * userspace knows about will be copied. If userspace provides a new + * struct, only the bits that the kernel knows about will be copied. 
+ */ + if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo)))) + return -EFAULT; + + return 0; +} + static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct task_struct *task __free(put_task) = NULL; @@ -122,13 +198,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; - if (arg) - return -EINVAL; - task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; + /* Extensible IOCTL that does not open namespace FDs, take a shortcut */ + if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) + return pidfd_info(task, cmd, arg); + + if (arg) + return -EINVAL; + scoped_guard(task_lock, task) { nsp = task->nsproxy; if (nsp) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 6c66a37522d0..4050942ab52f 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -200,11 +200,11 @@ EXPORT_SYMBOL(posix_acl_init); * Allocate a new ACL with the specified number of entries. */ struct posix_acl * -posix_acl_alloc(int count, gfp_t flags) +posix_acl_alloc(unsigned int count, gfp_t flags) { - const size_t size = sizeof(struct posix_acl) + - count * sizeof(struct posix_acl_entry); - struct posix_acl *acl = kmalloc(size, flags); + struct posix_acl *acl; + + acl = kmalloc(struct_size(acl, a_entries, count), flags); if (acl) posix_acl_init(acl, count); return acl; @@ -220,9 +220,8 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) struct posix_acl *clone = NULL; if (acl) { - int size = sizeof(struct posix_acl) + acl->a_count * - sizeof(struct posix_acl_entry); - clone = kmemdup(acl, size, flags); + clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), + flags); if (clone) refcount_set(&clone->a_refcount, 1); } diff --git a/fs/proc/base.c b/fs/proc/base.c index b31283d81c52..e9d7ddc52f69 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -58,7 +58,6 @@ #include <linux/init.h> #include <linux/capability.h> #include <linux/file.h> -#include 
<linux/fdtable.h> #include <linux/generic-radix-tree.h> #include <linux/string.h> #include <linux/seq_file.h> diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 1f54a54bfb91..24baf23e864f 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -77,7 +77,7 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file) return single_open(file, seq_show, inode); } -/** +/* * Shared /proc/pid/fdinfo and /proc/pid/fdinfo/fd permission helper to ensure * that the current task has PTRACE_MODE_READ in addition to the normal * POSIX-like checks. @@ -116,9 +116,7 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) { struct file *file; - rcu_read_lock(); - file = task_lookup_fdget_rcu(task, fd); - rcu_read_unlock(); + file = fget_task(task, fd); if (file) { *mode = file->f_mode; fput(file); @@ -258,19 +256,17 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!dir_emit_dots(file, ctx)) goto out; - rcu_read_lock(); for (fd = ctx->pos - 2;; fd++) { struct file *f; struct fd_data data; char name[10 + 1]; unsigned int len; - f = task_lookup_next_fdget_rcu(p, &fd); + f = fget_task_next(p, &fd); ctx->pos = fd + 2LL; if (!f) break; data.mode = f->f_mode; - rcu_read_unlock(); fput(f); data.fd = fd; @@ -278,11 +274,9 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!proc_fill_cache(file, ctx, name, len, instantiate, p, &data)) - goto out; + break; cond_resched(); - rcu_read_lock(); } - rcu_read_unlock(); out: put_task_struct(p); return 0; diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index f4616083faef..04bb29721419 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -20,7 +20,7 @@ static int show_softirqs(struct seq_file *p, void *v) for (i = 0; i < NR_SOFTIRQS; i++) { seq_printf(p, "%12s:", softirq_to_name[i]); for_each_possible_cpu(j) - seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); + seq_put_decimal_ull_width(p, " ", kstat_softirqs_cpu(i, j), 10); seq_putc(p, '\n'); } return 0; 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 72f14fd59c2d..38a5a3e9cba2 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -909,8 +909,15 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) { /* * Don't forget to update Documentation/ on changes. + * + * The length of the second argument of mnemonics[] + * needs to be 3 instead of previously set 2 + * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3]) + * to avoid spurious + * -Werror=unterminated-string-initialization warning + * with GCC 15 */ - static const char mnemonics[BITS_PER_LONG][2] = { + static const char mnemonics[BITS_PER_LONG][3] = { /* * In case if we meet a flag we don't know about. */ @@ -971,7 +978,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ -#ifdef CONFIG_X86_USER_SHADOW_STACK +#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) @@ -987,11 +994,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) for (i = 0; i < BITS_PER_LONG; i++) { if (!mnemonics[i][0]) continue; - if (vma->vm_flags & (1UL << i)) { - seq_putc(m, mnemonics[i][0]); - seq_putc(m, mnemonics[i][1]); - seq_putc(m, ' '); - } + if (vma->vm_flags & (1UL << i)) + seq_printf(m, "%s ", mnemonics[i]); } seq_putc(m, '\n'); } @@ -2661,8 +2665,10 @@ static int pagemap_scan_get_args(struct pm_scan_arg *arg, return -EFAULT; if (!arg->vec && arg->vec_len) return -EINVAL; + if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX) + return -EINVAL; if (arg->vec && !access_ok((void __user *)(long)arg->vec, - arg->vec_len * sizeof(struct page_region))) + size_mul(arg->vec_len, sizeof(struct page_region)))) return -EFAULT; /* Fixup default values */ diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index b52d85f8ad59..b4521b096058 
100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -457,10 +457,6 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) #endif } -static const struct vm_operations_struct vmcore_mmap_ops = { - .fault = mmap_vmcore_fault, -}; - /** * vmcore_alloc_buf - allocate buffer in vmalloc memory * @size: size of buffer @@ -488,6 +484,11 @@ static inline char *vmcore_alloc_buf(size_t size) * virtually contiguous user-space in ELF layout. */ #ifdef CONFIG_MMU + +static const struct vm_operations_struct vmcore_mmap_ops = { + .fault = mmap_vmcore_fault, +}; + /* * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages * reported as not being ram with the zero page. diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 290157bc7bec..7c2b75a44485 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -976,21 +976,19 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd, struct super_block *sb; unsigned int cmds = cmd >> SUBCMDSHIFT; unsigned int type = cmd & SUBCMDMASK; - struct fd f; + CLASS(fd_raw, f)(fd); int ret; - f = fdget_raw(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - ret = -EINVAL; if (type >= MAXQUOTAS) - goto out; + return -EINVAL; if (quotactl_cmd_write(cmds)) { ret = mnt_want_write(fd_file(f)->f_path.mnt); if (ret) - goto out; + return ret; } sb = fd_file(f)->f_path.mnt->mnt_sb; @@ -1008,7 +1006,5 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd, if (quotactl_cmd_write(cmds)) mnt_drop_write(fd_file(f)->f_path.mnt); -out: - fdput(f); return ret; } diff --git a/fs/read_write.c b/fs/read_write.c index 64dc24afdb3a..a6133241dfb8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -386,8 +386,8 @@ EXPORT_SYMBOL(vfs_llseek); static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; - struct fd f = fdget_pos(fd); - if (!fd_file(f)) + CLASS(fd_pos, f)(fd); + if (fd_empty(f)) return -EBADF; retval = -EINVAL; @@ -397,7 +397,6 @@ static off_t ksys_lseek(unsigned 
int fd, off_t offset, unsigned int whence) if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } - fdput_pos(f); return retval; } @@ -420,15 +419,14 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned int, whence) { int retval; - struct fd f = fdget_pos(fd); + CLASS(fd_pos, f)(fd); loff_t offset; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - retval = -EINVAL; if (whence > SEEK_MAX) - goto out_putf; + return -EINVAL; offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low, whence); @@ -439,8 +437,6 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } -out_putf: - fdput_pos(f); return retval; } #endif @@ -700,10 +696,10 @@ static inline loff_t *file_ppos(struct file *file) ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { - struct fd f = fdget_pos(fd); + CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; - if (fd_file(f)) { + if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; @@ -712,7 +708,6 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) ret = vfs_read(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; - fdput_pos(f); } return ret; } @@ -724,10 +719,10 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { - struct fd f = fdget_pos(fd); + CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; - if (fd_file(f)) { + if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; @@ -736,7 +731,6 @@ ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) ret = vfs_write(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; - fdput_pos(f); } return ret; @@ -751,21 +745,17 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, 
buf, ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { - struct fd f; - ssize_t ret = -EBADF; - if (pos < 0) return -EINVAL; - f = fdget(fd); - if (fd_file(f)) { - ret = -ESPIPE; - if (fd_file(f)->f_mode & FMODE_PREAD) - ret = vfs_read(fd_file(f), buf, count, &pos); - fdput(f); - } + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; - return ret; + if (fd_file(f)->f_mode & FMODE_PREAD) + return vfs_read(fd_file(f), buf, count, &pos); + + return -ESPIPE; } SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, @@ -785,21 +775,17 @@ COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { - struct fd f; - ssize_t ret = -EBADF; - if (pos < 0) return -EINVAL; - f = fdget(fd); - if (fd_file(f)) { - ret = -ESPIPE; - if (fd_file(f)->f_mode & FMODE_PWRITE) - ret = vfs_write(fd_file(f), buf, count, &pos); - fdput(f); - } + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; - return ret; + if (fd_file(f)->f_mode & FMODE_PWRITE) + return vfs_write(fd_file(f), buf, count, &pos); + + return -ESPIPE; } SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, @@ -1075,10 +1061,10 @@ out: static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { - struct fd f = fdget_pos(fd); + CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; - if (fd_file(f)) { + if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; @@ -1087,7 +1073,6 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; - fdput_pos(f); } if (ret > 0) @@ -1099,10 +1084,10 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { - struct fd 
f = fdget_pos(fd); + CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; - if (fd_file(f)) { + if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; @@ -1111,7 +1096,6 @@ static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; - fdput_pos(f); } if (ret > 0) @@ -1129,18 +1113,16 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { - struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; - f = fdget(fd); - if (fd_file(f)) { + CLASS(fd, f)(fd); + if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PREAD) ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags); - fdput(f); } if (ret > 0) @@ -1152,18 +1134,16 @@ static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { - struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; - f = fdget(fd); - if (fd_file(f)) { + CLASS(fd, f)(fd); + if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PWRITE) ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags); - fdput(f); } if (ret > 0) @@ -1315,7 +1295,6 @@ COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { - struct fd in, out; struct inode *in_inode, *out_inode; struct pipe_inode_info *opipe; loff_t pos; @@ -1326,35 +1305,32 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, /* * Get input file, and verify that it is ok.. 
*/ - retval = -EBADF; - in = fdget(in_fd); - if (!fd_file(in)) - goto out; + CLASS(fd, in)(in_fd); + if (fd_empty(in)) + return -EBADF; if (!(fd_file(in)->f_mode & FMODE_READ)) - goto fput_in; - retval = -ESPIPE; + return -EBADF; if (!ppos) { pos = fd_file(in)->f_pos; } else { pos = *ppos; if (!(fd_file(in)->f_mode & FMODE_PREAD)) - goto fput_in; + return -ESPIPE; } retval = rw_verify_area(READ, fd_file(in), &pos, count); if (retval < 0) - goto fput_in; + return retval; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. */ - retval = -EBADF; - out = fdget(out_fd); - if (!fd_file(out)) - goto fput_in; + CLASS(fd, out)(out_fd); + if (fd_empty(out)) + return -EBADF; if (!(fd_file(out)->f_mode & FMODE_WRITE)) - goto fput_out; + return -EBADF; in_inode = file_inode(fd_file(in)); out_inode = file_inode(fd_file(out)); out_pos = fd_file(out)->f_pos; @@ -1363,9 +1339,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); if (unlikely(pos + count > max)) { - retval = -EOVERFLOW; if (pos >= max) - goto fput_out; + return -EOVERFLOW; count = max - pos; } @@ -1384,7 +1359,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!opipe) { retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count); if (retval < 0) - goto fput_out; + return retval; retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos, count, fl); } else { @@ -1410,12 +1385,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, inc_syscw(current); if (pos > max) retval = -EOVERFLOW; - -fput_out: - fdput(out); -fput_in: - fdput(in); -out: return retval; } @@ -1671,36 +1640,32 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, { loff_t pos_in; loff_t pos_out; - struct fd f_in; - struct fd f_out; ssize_t ret = -EBADF; - f_in = fdget(fd_in); - if (!fd_file(f_in)) - goto out2; + CLASS(fd, f_in)(fd_in); + if (fd_empty(f_in)) + 
return -EBADF; - f_out = fdget(fd_out); - if (!fd_file(f_out)) - goto out1; + CLASS(fd, f_out)(fd_out); + if (fd_empty(f_out)) + return -EBADF; - ret = -EFAULT; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) - goto out; + return -EFAULT; } else { pos_in = fd_file(f_in)->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) - goto out; + return -EFAULT; } else { pos_out = fd_file(f_out)->f_pos; } - ret = -EINVAL; if (flags != 0) - goto out; + return -EINVAL; ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len, flags); @@ -1722,12 +1687,6 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, fd_file(f_out)->f_pos = pos_out; } } - -out: - fdput(f_out); -out1: - fdput(f_in); -out2: return ret; } @@ -1830,18 +1789,22 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; } -bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos) +int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) - return false; + return -EINVAL; if (!is_power_of_2(len)) - return false; + return -EINVAL; - if (!IS_ALIGNED(pos, len)) - return false; + if (!IS_ALIGNED(iocb->ki_pos, len)) + return -EINVAL; - return true; + if (!(iocb->ki_flags & IOCB_DIRECT)) + return -EOPNOTSUPP; + + return 0; } +EXPORT_SYMBOL_GPL(generic_atomic_write_valid); diff --git a/fs/readdir.c b/fs/readdir.c index 6d29cab8576e..0038efda417b 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -219,20 +219,19 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd, struct old_linux_dirent __user *, dirent, unsigned int, count) { int error; - struct fd f = fdget_pos(fd); + CLASS(fd_pos, f)(fd); struct readdir_callback buf = { .ctx.actor = fillonedir, .dirent = dirent }; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); if (buf.result) error = buf.result; - fdput_pos(f); return error; } 
@@ -309,7 +308,7 @@ efault: SYSCALL_DEFINE3(getdents, unsigned int, fd, struct linux_dirent __user *, dirent, unsigned int, count) { - struct fd f; + CLASS(fd_pos, f)(fd); struct getdents_callback buf = { .ctx.actor = filldir, .count = count, @@ -317,8 +316,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, }; int error; - f = fdget_pos(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); @@ -333,7 +331,6 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, else error = count - buf.count; } - fdput_pos(f); return error; } @@ -392,7 +389,7 @@ efault: SYSCALL_DEFINE3(getdents64, unsigned int, fd, struct linux_dirent64 __user *, dirent, unsigned int, count) { - struct fd f; + CLASS(fd_pos, f)(fd); struct getdents_callback64 buf = { .ctx.actor = filldir64, .count = count, @@ -400,8 +397,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, }; int error; - f = fdget_pos(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); @@ -417,7 +413,6 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, else error = count - buf.count; } - fdput_pos(f); return error; } @@ -477,20 +472,19 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, struct compat_old_linux_dirent __user *, dirent, unsigned int, count) { int error; - struct fd f = fdget_pos(fd); + CLASS(fd_pos, f)(fd); struct compat_readdir_callback buf = { .ctx.actor = compat_fillonedir, .dirent = dirent }; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); if (buf.result) error = buf.result; - fdput_pos(f); return error; } @@ -560,7 +554,7 @@ efault: COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, struct compat_linux_dirent __user *, dirent, unsigned int, count) { - struct fd f; + CLASS(fd_pos, f)(fd); struct compat_getdents_callback buf = { .ctx.actor = compat_filldir, .current_dir = dirent, @@ -568,8 +562,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, }; int error; - f = 
fdget_pos(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); @@ -584,7 +577,6 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, else error = count - buf.count; } - fdput_pos(f); return error; } #endif diff --git a/fs/remap_range.c b/fs/remap_range.c index 4403d5c68fcb..26afbbbfb10c 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -536,20 +536,19 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) } for (i = 0, info = same->info; i < count; i++, info++) { - struct fd dst_fd = fdget(info->dest_fd); - struct file *dst_file = fd_file(dst_fd); + CLASS(fd, dst_fd)(info->dest_fd); - if (!dst_file) { + if (fd_empty(dst_fd)) { info->status = -EBADF; goto next_loop; } if (info->reserved) { info->status = -EINVAL; - goto next_fdput; + goto next_loop; } - deduped = vfs_dedupe_file_range_one(file, off, dst_file, + deduped = vfs_dedupe_file_range_one(file, off, fd_file(dst_fd), info->dest_offset, len, REMAP_FILE_CAN_SHORTEN); if (deduped == -EBADE) @@ -559,8 +558,6 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) else info->bytes_deduped = len; -next_fdput: - fdput(dst_fd); next_loop: if (fatal_signal_pending(current)) break; diff --git a/fs/select.c b/fs/select.c index a77907faf2b4..e223d1fe9d55 100644 --- a/fs/select.c +++ b/fs/select.c @@ -462,15 +462,22 @@ get_max: EPOLLNVAL) #define POLLEX_SET (EPOLLPRI | EPOLLNVAL) -static inline void wait_key_set(poll_table *wait, unsigned long in, +static inline __poll_t select_poll_one(int fd, poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, __poll_t ll_flag) { + CLASS(fd, f)(fd); + + if (fd_empty(f)) + return EPOLLNVAL; + wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) wait->_key |= POLLOUT_SET; + + return vfs_poll(fd_file(f), wait); } static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) @@ -522,20 +529,12 @@ 
static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec } for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { - struct fd f; if (i >= n) break; if (!(bit & all_bits)) continue; - mask = EPOLLNVAL; - f = fdget(i); - if (fd_file(f)) { - wait_key_set(wait, in, out, bit, - busy_flag); - mask = vfs_poll(fd_file(f), wait); - - fdput(f); - } + mask = select_poll_one(i, wait, in, out, bit, + busy_flag); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; @@ -856,15 +855,14 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, __poll_t busy_flag) { int fd = pollfd->fd; - __poll_t mask = 0, filter; - struct fd f; + __poll_t mask, filter; if (fd < 0) - goto out; - mask = EPOLLNVAL; - f = fdget(fd); - if (!fd_file(f)) - goto out; + return 0; + + CLASS(fd, f)(fd); + if (fd_empty(f)) + return EPOLLNVAL; /* userland u16 ->events contains POLL... bitmap */ filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; @@ -872,13 +870,7 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, mask = vfs_poll(fd_file(f), pwait); if (mask & busy_flag) *can_busy_poll = true; - mask &= filter; /* Mask out unneeded events. */ - fdput(f); - -out: - /* ... and so does ->revents */ - pollfd->revents = mangle_poll(mask); - return mask; + return mask & filter; /* Mask out unneeded events. */ } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, @@ -910,6 +902,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, pfd = walk->entries; pfd_end = pfd + walk->len; for (; pfd != pfd_end; pfd++) { + __poll_t mask; /* * Fish for events. If we found one, record it * and kill poll_table->_qproc, so we don't @@ -917,8 +910,9 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, * this. They'll get immediately deregistered * when we break out and return. 
*/ - if (do_pollfd(pfd, pt, &can_busy_loop, - busy_flag)) { + mask = do_pollfd(pfd, pt, &can_busy_loop, busy_flag); + pfd->revents = mangle_poll(mask); + if (mask) { count++; pt->_qproc = NULL; /* found something, stop busy polling */ diff --git a/fs/seq_file.c b/fs/seq_file.c index e676c8b0cf5d..8bbb1ad46335 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -343,8 +343,8 @@ EXPORT_SYMBOL(seq_lseek); /** * seq_release - free the structures associated with sequential file. - * @file: file in question * @inode: its inode + * @file: file in question * * Frees the structures associated with sequential file; can be used * as ->f_op->release() if you don't have private data to destroy. diff --git a/fs/signalfd.c b/fs/signalfd.c index 736bebf93591..d1a5f43ce466 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -288,20 +288,17 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) fd_install(ufd, file); } else { - struct fd f = fdget(ufd); - if (!fd_file(f)) + CLASS(fd, f)(ufd); + if (fd_empty(f)) return -EBADF; ctx = fd_file(f)->private_data; - if (fd_file(f)->f_op != &signalfd_fops) { - fdput(f); + if (fd_file(f)->f_op != &signalfd_fops) return -EINVAL; - } spin_lock_irq(&current->sighand->siglock); ctx->sigmask = *mask; spin_unlock_irq(&current->sighand->siglock); wake_up(&current->sighand->signalfd_wqh); - fdput(f); } return ufd; diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 000e1ef3beea..20cafdff5081 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1780,7 +1780,7 @@ static int cifs_init_netfs(void) nomem_subreqpool: kmem_cache_destroy(cifs_io_subrequest_cachep); nomem_subreq: - mempool_destroy(&cifs_io_request_pool); + mempool_exit(&cifs_io_request_pool); nomem_reqpool: kmem_cache_destroy(cifs_io_request_cachep); nomem_req: diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 68c716e6261b..1d3470bca45e 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -252,10 +252,6 @@ extern int 
cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read); extern ssize_t cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read); -extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, - struct page *page, - unsigned int page_offset, - unsigned int to_read); int cifs_read_iter_from_socket(struct TCP_Server_Info *server, struct iov_iter *iter, unsigned int to_read); @@ -623,8 +619,6 @@ enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); -struct cifs_chan * -cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server); int cifs_try_adding_channels(struct cifs_ses *ses); bool is_server_using_iface(struct TCP_Server_Info *server, struct cifs_server_iface *iface); @@ -640,9 +634,6 @@ cifs_chan_set_in_reconnect(struct cifs_ses *ses, void cifs_chan_clear_in_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server); -bool -cifs_chan_in_reconnect(struct cifs_ses *ses, - struct TCP_Server_Info *server); void cifs_chan_set_need_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server); diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c index 63b5a55b7a57..766b4de13da7 100644 --- a/fs/smb/client/compress.c +++ b/fs/smb/client/compress.c @@ -166,7 +166,6 @@ static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample) loff_t start = iter->xarray_start + iter->iov_offset; pgoff_t last, index = start / PAGE_SIZE; size_t len, off, foff; - ssize_t ret = 0; void *p; int s = 0; @@ -193,9 +192,6 @@ static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample) memcpy(&sample[s], p, len2); kunmap_local(p); - if (ret < 0) - return ret; - s += len2; if (len2 < SZ_2K || s >= max - SZ_2K) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index adf8758847f6..0ce2d704b1f3 100644 --- a/fs/smb/client/connect.c +++ 
b/fs/smb/client/connect.c @@ -795,18 +795,6 @@ cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read) } int -cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, - unsigned int page_offset, unsigned int to_read) -{ - struct msghdr smb_msg = {}; - struct bio_vec bv; - - bvec_set_page(&bv, page, to_read, page_offset); - iov_iter_bvec(&smb_msg.msg_iter, ITER_DEST, &bv, 1, to_read); - return cifs_readv_from_socket(server, &smb_msg); -} - -int cifs_read_iter_from_socket(struct TCP_Server_Info *server, struct iov_iter *iter, unsigned int to_read) { @@ -1049,6 +1037,7 @@ clean_demultiplex_info(struct TCP_Server_Info *server) */ } + put_net(cifs_net_ns(server)); kfree(server->leaf_fullpath); kfree(server); @@ -1647,8 +1636,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) /* srv_count can never go negative */ WARN_ON(server->srv_count < 0); - put_net(cifs_net_ns(server)); - list_del_init(&server->tcp_ses_list); spin_unlock(&cifs_tcp_ses_lock); @@ -3082,13 +3069,22 @@ generic_ip_connect(struct TCP_Server_Info *server) if (server->ssocket) { socket = server->ssocket; } else { - rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM, + struct net *net = cifs_net_ns(server); + struct sock *sk; + + rc = __sock_create(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket, 1); if (rc < 0) { cifs_server_dbg(VFS, "Error %d creating socket\n", rc); return rc; } + sk = server->ssocket->sk; + __netns_tracker_free(net, &sk->ns_tracker, false); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); + /* BB other socket options to set KEEPALIVE, NODELAY? 
*/ cifs_dbg(FYI, "Socket created\n"); socket = server->ssocket; diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 28c4e576d460..5c5a52019efa 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -920,8 +920,15 @@ static int smb3_reconfigure(struct fs_context *fc) else { kfree_sensitive(ses->password); ses->password = kstrdup(ctx->password, GFP_KERNEL); + if (!ses->password) + return -ENOMEM; kfree_sensitive(ses->password2); ses->password2 = kstrdup(ctx->password2, GFP_KERNEL); + if (!ses->password2) { + kfree_sensitive(ses->password); + ses->password = NULL; + return -ENOMEM; + } } STEAL_STRING(cifs_sb, ctx, domainname); STEAL_STRING(cifs_sb, ctx, nodename); diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index 2ce193609d8b..56439da4f119 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -72,7 +72,6 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file, unsigned long srcfd) { int rc; - struct fd src_file; struct inode *src_inode; cifs_dbg(FYI, "ioctl copychunk range\n"); @@ -89,8 +88,8 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file, return rc; } - src_file = fdget(srcfd); - if (!fd_file(src_file)) { + CLASS(fd, src_file)(srcfd); + if (fd_empty(src_file)) { rc = -EBADF; goto out_drop_write; } @@ -98,20 +97,18 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file, if (fd_file(src_file)->f_op->unlocked_ioctl != cifs_ioctl) { rc = -EBADF; cifs_dbg(VFS, "src file seems to be from a different filesystem type\n"); - goto out_fput; + goto out_drop_write; } src_inode = file_inode(fd_file(src_file)); rc = -EINVAL; if (S_ISDIR(src_inode->i_mode)) - goto out_fput; + goto out_drop_write; rc = cifs_file_copychunk_range(xid, fd_file(src_file), 0, dst_file, 0, src_inode->i_size, 0); if (rc > 0) rc = 0; -out_fput: - fdput(src_file); out_drop_write: mnt_drop_write_file(dst_file); return rc; diff --git a/fs/smb/client/sess.c 
b/fs/smb/client/sess.c index 3216f786908f..c88e9657f47a 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -115,18 +115,6 @@ cifs_chan_clear_in_reconnect(struct cifs_ses *ses, ses->chans[chan_index].in_reconnect = false; } -bool -cifs_chan_in_reconnect(struct cifs_ses *ses, - struct TCP_Server_Info *server) -{ - unsigned int chan_index = cifs_ses_get_chan_index(ses, server); - - if (chan_index == CIFS_INVAL_CHAN_INDEX) - return true; /* err on the safer side */ - - return CIFS_CHAN_IN_RECONNECT(ses, chan_index); -} - void cifs_chan_set_need_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server) @@ -487,26 +475,6 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) spin_unlock(&ses->chan_lock); } -/* - * If server is a channel of ses, return the corresponding enclosing - * cifs_chan otherwise return NULL. - */ -struct cifs_chan * -cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server) -{ - int i; - - spin_lock(&ses->chan_lock); - for (i = 0; i < ses->chan_count; i++) { - if (ses->chans[i].server == server) { - spin_unlock(&ses->chan_lock); - return &ses->chans[i]; - } - } - spin_unlock(&ses->chan_lock); - return NULL; -} - static int cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 6b385fce3f2a..24a2aa04a108 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1158,7 +1158,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; unsigned int size[1]; void *data[1]; - struct smb2_file_full_ea_info *ea = NULL; + struct smb2_file_full_ea_info *ea; struct smb2_query_info_rsp *rsp; int rc, used_len = 0; int retries = 0, cur_sleep = 1; @@ -1179,6 +1179,7 @@ replay_again: if (!utf16_path) return -ENOMEM; + ea = NULL; resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; vars = kzalloc(sizeof(*vars), GFP_KERNEL); if (!vars) { diff --git 
a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index b2f16a7b696d..6584b5cddc28 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -3313,6 +3313,15 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, return rc; if (indatalen) { + unsigned int len; + + if (WARN_ON_ONCE(smb3_encryption_required(tcon) && + (check_add_overflow(total_len - 1, + ALIGN(indatalen, 8), &len) || + len > MAX_CIFS_SMALL_BUFFER_SIZE))) { + cifs_small_buf_release(req); + return -EIO; + } /* * indatalen is usually small at a couple of bytes max, so * just allocate through generic pool diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 09b20039636e..611716bc8f27 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -512,6 +512,7 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, int in_len, char *out_blob, int *out_len) { struct ksmbd_spnego_authen_response *resp; + struct ksmbd_login_response_ext *resp_ext = NULL; struct ksmbd_user *user = NULL; int retval; @@ -540,7 +541,10 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, goto out; } - user = ksmbd_alloc_user(&resp->login_response); + if (resp->login_response.status & KSMBD_USER_FLAG_EXTENSION) + resp_ext = ksmbd_ipc_login_request_ext(resp->login_response.account); + + user = ksmbd_alloc_user(&resp->login_response, resp_ext); if (!user) { ksmbd_debug(AUTH, "login failure\n"); retval = -ENOMEM; diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index aa2a37a7ce84..e6a72f75ab94 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -70,6 +70,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); atomic_set(&conn->refcnt, 1); + atomic_set(&conn->mux_smb_requests, 0); conn->total_credits = 1; conn->outstanding_credits = 0; diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index b379ae4fdcdf..8ddd5a3c7baf 100644 --- 
a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -107,6 +107,7 @@ struct ksmbd_conn { __le16 signing_algorithm; bool binding; atomic_t refcnt; + atomic_t mux_smb_requests; }; struct ksmbd_conn_ops { diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index 38e6fd2da3b8..3d01d9d15293 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -51,6 +51,9 @@ * - KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST/RESPONSE(ksmbd_spnego_authen_request/response) * This event is to make kerberos authentication to be processed in * userspace. + * + * - KSMBD_EVENT_LOGIN_REQUEST_EXT/RESPONSE_EXT(ksmbd_login_request_ext/response_ext) + * This event is to get user account extension info to user IPC daemon. */ #define KSMBD_GENL_NAME "SMBD_GENL" @@ -146,6 +149,16 @@ struct ksmbd_login_response { }; /* + * IPC user login response extension. + */ +struct ksmbd_login_response_ext { + __u32 handle; + __s32 ngroups; /* supplementary group count */ + __s8 reserved[128]; /* Reserved room */ + __s8 ____payload[]; +}; + +/* * IPC request to fetch net share config. */ struct ksmbd_share_config_request { @@ -306,6 +319,9 @@ enum ksmbd_event { KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST, KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE = 15, + KSMBD_EVENT_LOGIN_REQUEST_EXT, + KSMBD_EVENT_LOGIN_RESPONSE_EXT, + __KSMBD_EVENT_MAX, KSMBD_EVENT_MAX = __KSMBD_EVENT_MAX - 1 }; @@ -336,6 +352,7 @@ enum KSMBD_TREE_CONN_STATUS { #define KSMBD_USER_FLAG_BAD_USER BIT(3) #define KSMBD_USER_FLAG_GUEST_ACCOUNT BIT(4) #define KSMBD_USER_FLAG_DELAY_SESSION BIT(5) +#define KSMBD_USER_FLAG_EXTENSION BIT(6) /* * Share config flags. 
diff --git a/fs/smb/server/mgmt/user_config.c b/fs/smb/server/mgmt/user_config.c index 279d00feff21..421a4a95e216 100644 --- a/fs/smb/server/mgmt/user_config.c +++ b/fs/smb/server/mgmt/user_config.c @@ -12,6 +12,7 @@ struct ksmbd_user *ksmbd_login_user(const char *account) { struct ksmbd_login_response *resp; + struct ksmbd_login_response_ext *resp_ext = NULL; struct ksmbd_user *user = NULL; resp = ksmbd_ipc_login_request(account); @@ -21,15 +22,19 @@ struct ksmbd_user *ksmbd_login_user(const char *account) if (!(resp->status & KSMBD_USER_FLAG_OK)) goto out; - user = ksmbd_alloc_user(resp); + if (resp->status & KSMBD_USER_FLAG_EXTENSION) + resp_ext = ksmbd_ipc_login_request_ext(account); + + user = ksmbd_alloc_user(resp, resp_ext); out: kvfree(resp); return user; } -struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp) +struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp, + struct ksmbd_login_response_ext *resp_ext) { - struct ksmbd_user *user = NULL; + struct ksmbd_user *user; user = kmalloc(sizeof(struct ksmbd_user), GFP_KERNEL); if (!user) @@ -44,18 +49,42 @@ struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp) if (user->passkey) memcpy(user->passkey, resp->hash, resp->hash_sz); - if (!user->name || !user->passkey) { - kfree(user->name); - kfree(user->passkey); - kfree(user); - user = NULL; + user->ngroups = 0; + user->sgid = NULL; + + if (!user->name || !user->passkey) + goto err_free; + + if (resp_ext) { + if (resp_ext->ngroups > NGROUPS_MAX) { + pr_err("ngroups(%u) from login response exceeds max groups(%d)\n", + resp_ext->ngroups, NGROUPS_MAX); + goto err_free; + } + + user->sgid = kmemdup(resp_ext->____payload, + resp_ext->ngroups * sizeof(gid_t), + GFP_KERNEL); + if (!user->sgid) + goto err_free; + + user->ngroups = resp_ext->ngroups; + ksmbd_debug(SMB, "supplementary groups : %d\n", user->ngroups); } + return user; + +err_free: + kfree(user->name); + kfree(user->passkey); + kfree(user); + return NULL; } 
void ksmbd_free_user(struct ksmbd_user *user) { ksmbd_ipc_logout_request(user->name, user->flags); + kfree(user->sgid); kfree(user->name); kfree(user->passkey); kfree(user); diff --git a/fs/smb/server/mgmt/user_config.h b/fs/smb/server/mgmt/user_config.h index e068a19fd904..8c227b8d4954 100644 --- a/fs/smb/server/mgmt/user_config.h +++ b/fs/smb/server/mgmt/user_config.h @@ -18,6 +18,8 @@ struct ksmbd_user { size_t passkey_sz; char *passkey; + int ngroups; + gid_t *sgid; }; static inline bool user_guest(struct ksmbd_user *user) @@ -60,7 +62,8 @@ static inline unsigned int user_gid(struct ksmbd_user *user) } struct ksmbd_user *ksmbd_login_user(const char *account); -struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp); +struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp, + struct ksmbd_login_response_ext *resp_ext); void ksmbd_free_user(struct ksmbd_user *user); int ksmbd_anonymous_user(struct ksmbd_user *user); bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2); diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 99416ce9f501..ad02fe555fda 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -90,7 +90,7 @@ static int __rpc_method(char *rpc_name) int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) { - struct ksmbd_session_rpc *entry; + struct ksmbd_session_rpc *entry, *old; struct ksmbd_rpc_command *resp; int method; @@ -106,16 +106,19 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) entry->id = ksmbd_ipc_id_alloc(); if (entry->id < 0) goto free_entry; - xa_store(&sess->rpc_handle_list, entry->id, entry, GFP_KERNEL); + old = xa_store(&sess->rpc_handle_list, entry->id, entry, GFP_KERNEL); + if (xa_is_err(old)) + goto free_id; resp = ksmbd_rpc_open(sess, entry->id); if (!resp) - goto free_id; + goto erase_xa; kvfree(resp); return entry->id; -free_id: +erase_xa: xa_erase(&sess->rpc_handle_list, 
entry->id); +free_id: ksmbd_rpc_id_free(entry->id); free_entry: kfree(entry); @@ -175,11 +178,13 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) unsigned long id; struct ksmbd_session *sess; + down_write(&sessions_table_lock); down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { - if (sess->state != SMB2_SESSION_VALID || - time_after(jiffies, - sess->last_active + SMB2_SESSION_TIMEOUT)) { + if (atomic_read(&sess->refcnt) == 0 && + (sess->state != SMB2_SESSION_VALID || + time_after(jiffies, + sess->last_active + SMB2_SESSION_TIMEOUT))) { xa_erase(&conn->sessions, sess->id); hash_del(&sess->hlist); ksmbd_session_destroy(sess); @@ -187,6 +192,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) } } up_write(&conn->session_lock); + up_write(&sessions_table_lock); } int ksmbd_session_register(struct ksmbd_conn *conn, @@ -228,7 +234,6 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) } } } - up_write(&sessions_table_lock); down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { @@ -248,6 +253,7 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) } } up_write(&conn->session_lock); + up_write(&sessions_table_lock); } struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, @@ -269,8 +275,6 @@ struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id) down_read(&sessions_table_lock); sess = __session_lookup(id); - if (sess) - sess->last_active = jiffies; up_read(&sessions_table_lock); return sess; @@ -289,6 +293,22 @@ struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn, return sess; } +void ksmbd_user_session_get(struct ksmbd_session *sess) +{ + atomic_inc(&sess->refcnt); +} + +void ksmbd_user_session_put(struct ksmbd_session *sess) +{ + if (!sess) + return; + + if (atomic_read(&sess->refcnt) <= 0) + WARN_ON(1); + else + atomic_dec(&sess->refcnt); +} + struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn, u64 sess_id) { @@ -393,6 
+413,7 @@ static struct ksmbd_session *__session_create(int protocol) xa_init(&sess->rpc_handle_list); sess->sequence_number = 1; rwlock_init(&sess->tree_conns_lock); + atomic_set(&sess->refcnt, 1); ret = __init_smb2_session(sess); if (ret) diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h index dc9fded2cd43..c1c4b20bd5c6 100644 --- a/fs/smb/server/mgmt/user_session.h +++ b/fs/smb/server/mgmt/user_session.h @@ -61,6 +61,8 @@ struct ksmbd_session { struct ksmbd_file_table file_table; unsigned long last_active; rwlock_t tree_conns_lock; + + atomic_t refcnt; }; static inline int test_session_flag(struct ksmbd_session *sess, int bit) @@ -104,4 +106,6 @@ void ksmbd_release_tree_conn_id(struct ksmbd_session *sess, int id); int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name); void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id); int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id); +void ksmbd_user_session_get(struct ksmbd_session *sess); +void ksmbd_user_session_put(struct ksmbd_session *sess); #endif /* __USER_SESSION_MANAGEMENT_H__ */ diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 231d2d224656..e6cfedba9992 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -241,6 +241,8 @@ send: if (work->tcon) ksmbd_tree_connect_put(work->tcon); smb3_preauth_hash_rsp(work); + if (work->sess) + ksmbd_user_session_put(work->sess); if (work->sess && work->sess->enc && work->encrypted && conn->ops->encrypt_resp) { rc = conn->ops->encrypt_resp(work); @@ -268,6 +270,7 @@ static void handle_ksmbd_work(struct work_struct *wk) ksmbd_conn_try_dequeue_request(work); ksmbd_free_work_struct(work); + atomic_dec(&conn->mux_smb_requests); /* * Checking waitqueue to dropping pending requests on * disconnection. 
waitqueue_active is safe because it @@ -289,6 +292,15 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn) struct ksmbd_work *work; int err; + err = ksmbd_init_smb_server(conn); + if (err) + return 0; + + if (atomic_inc_return(&conn->mux_smb_requests) >= conn->vals->max_credits) { + atomic_dec_return(&conn->mux_smb_requests); + return -ENOSPC; + } + work = ksmbd_alloc_work_struct(); if (!work) { pr_err("allocation for work failed\n"); @@ -299,12 +311,6 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn) work->request_buf = conn->request_buf; conn->request_buf = NULL; - err = ksmbd_init_smb_server(work); - if (err) { - ksmbd_free_work_struct(work); - return 0; - } - ksmbd_conn_enqueue_request(work); atomic_inc(&conn->r_count); /* update activity on connection */ diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 797b0f24097b..599118aed205 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -605,8 +605,10 @@ int smb2_check_user_session(struct ksmbd_work *work) /* Check for validity of user session */ work->sess = ksmbd_session_lookup_all(conn, sess_id); - if (work->sess) + if (work->sess) { + ksmbd_user_session_get(work->sess); return 1; + } ksmbd_debug(SMB, "Invalid user session, Uid %llu\n", sess_id); return -ENOENT; } @@ -1740,6 +1742,7 @@ int smb2_sess_setup(struct ksmbd_work *work) } conn->binding = true; + ksmbd_user_session_get(sess); } else if ((conn->dialect < SMB30_PROT_ID || server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) && (req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { @@ -1766,6 +1769,7 @@ int smb2_sess_setup(struct ksmbd_work *work) } conn->binding = false; + ksmbd_user_session_get(sess); } work->sess = sess; @@ -2228,7 +2232,9 @@ int smb2_session_logoff(struct ksmbd_work *work) } ksmbd_destroy_file_table(&sess->file_table); + down_write(&conn->session_lock); sess->state = SMB2_SESSION_EXPIRED; + up_write(&conn->session_lock); ksmbd_free_user(sess->user); sess->user = NULL; diff --git 
a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index 5b8d75e78ffb..75b4eb856d32 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -388,6 +388,10 @@ static struct smb_version_ops smb1_server_ops = { .set_rsp_status = set_smb1_rsp_status, }; +static struct smb_version_values smb1_server_values = { + .max_credits = SMB2_MAX_CREDITS, +}; + static int smb1_negotiate(struct ksmbd_work *work) { return ksmbd_smb_negotiate_common(work, SMB_COM_NEGOTIATE); @@ -399,18 +403,18 @@ static struct smb_version_cmds smb1_server_cmds[1] = { static int init_smb1_server(struct ksmbd_conn *conn) { + conn->vals = &smb1_server_values; conn->ops = &smb1_server_ops; conn->cmds = smb1_server_cmds; conn->max_cmds = ARRAY_SIZE(smb1_server_cmds); return 0; } -int ksmbd_init_smb_server(struct ksmbd_work *work) +int ksmbd_init_smb_server(struct ksmbd_conn *conn) { - struct ksmbd_conn *conn = work->conn; __le32 proto; - proto = *(__le32 *)((struct smb_hdr *)work->request_buf)->Protocol; + proto = *(__le32 *)((struct smb_hdr *)conn->request_buf)->Protocol; if (conn->need_neg == false) { if (proto == SMB1_PROTO_NUMBER) return -EINVAL; @@ -736,13 +740,15 @@ int __ksmbd_override_fsids(struct ksmbd_work *work, struct ksmbd_share_config *share) { struct ksmbd_session *sess = work->sess; + struct ksmbd_user *user = sess->user; struct cred *cred; struct group_info *gi; unsigned int uid; unsigned int gid; + int i; - uid = user_uid(sess->user); - gid = user_gid(sess->user); + uid = user_uid(user); + gid = user_gid(user); if (share->force_uid != KSMBD_SHARE_INVALID_UID) uid = share->force_uid; if (share->force_gid != KSMBD_SHARE_INVALID_GID) @@ -755,11 +761,18 @@ int __ksmbd_override_fsids(struct ksmbd_work *work, cred->fsuid = make_kuid(&init_user_ns, uid); cred->fsgid = make_kgid(&init_user_ns, gid); - gi = groups_alloc(0); + gi = groups_alloc(user->ngroups); if (!gi) { abort_creds(cred); return -ENOMEM; } + + for (i = 0; i < user->ngroups; i++) + gi->gid[i] = 
make_kgid(&init_user_ns, user->sgid[i]); + + if (user->ngroups) + groups_sort(gi); + set_groups(cred, gi); put_group_info(gi); diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index cc1d6dfe29d5..a3d8a905b07e 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -427,7 +427,7 @@ bool ksmbd_smb_request(struct ksmbd_conn *conn); int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count); -int ksmbd_init_smb_server(struct ksmbd_work *work); +int ksmbd_init_smb_server(struct ksmbd_conn *conn); struct ksmbd_kstat; int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 8752ac82c557..2f27afb695f6 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -120,6 +120,12 @@ static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX + 1] = { }, [KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE] = { }, + [KSMBD_EVENT_LOGIN_REQUEST_EXT] = { + .len = sizeof(struct ksmbd_login_request), + }, + [KSMBD_EVENT_LOGIN_RESPONSE_EXT] = { + .len = sizeof(struct ksmbd_login_response_ext), + }, }; static struct genl_ops ksmbd_genl_ops[] = { @@ -187,6 +193,14 @@ static struct genl_ops ksmbd_genl_ops[] = { .cmd = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE, .doit = handle_generic_event, }, + { + .cmd = KSMBD_EVENT_LOGIN_REQUEST_EXT, + .doit = handle_unsupported_event, + }, + { + .cmd = KSMBD_EVENT_LOGIN_RESPONSE_EXT, + .doit = handle_generic_event, + }, }; static struct genl_family ksmbd_genl_family = { @@ -198,7 +212,7 @@ static struct genl_family ksmbd_genl_family = { .module = THIS_MODULE, .ops = ksmbd_genl_ops, .n_ops = ARRAY_SIZE(ksmbd_genl_ops), - .resv_start_op = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE + 1, + .resv_start_op = KSMBD_EVENT_LOGIN_RESPONSE_EXT + 1, }; static void ksmbd_nl_init_fixup(void) @@ -459,16 +473,24 @@ static int ipc_validate_msg(struct ipc_msg_table_entry *entry) { unsigned int msg_sz = entry->msg_sz; - if (entry->type 
== KSMBD_EVENT_RPC_REQUEST) { + switch (entry->type) { + case KSMBD_EVENT_RPC_REQUEST: + { struct ksmbd_rpc_command *resp = entry->response; msg_sz = sizeof(struct ksmbd_rpc_command) + resp->payload_sz; - } else if (entry->type == KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST) { + break; + } + case KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST: + { struct ksmbd_spnego_authen_response *resp = entry->response; msg_sz = sizeof(struct ksmbd_spnego_authen_response) + resp->session_key_len + resp->spnego_blob_len; - } else if (entry->type == KSMBD_EVENT_SHARE_CONFIG_REQUEST) { + break; + } + case KSMBD_EVENT_SHARE_CONFIG_REQUEST: + { struct ksmbd_share_config_response *resp = entry->response; if (resp->payload_sz) { @@ -478,6 +500,17 @@ static int ipc_validate_msg(struct ipc_msg_table_entry *entry) msg_sz = sizeof(struct ksmbd_share_config_response) + resp->payload_sz; } + break; + } + case KSMBD_EVENT_LOGIN_REQUEST_EXT: + { + struct ksmbd_login_response_ext *resp = entry->response; + + if (resp->ngroups) { + msg_sz = sizeof(struct ksmbd_login_response_ext) + + resp->ngroups * sizeof(gid_t); + } + } } return entry->msg_sz != msg_sz ? 
-EINVAL : 0; @@ -560,6 +593,29 @@ struct ksmbd_login_response *ksmbd_ipc_login_request(const char *account) return resp; } +struct ksmbd_login_response_ext *ksmbd_ipc_login_request_ext(const char *account) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_login_request *req; + struct ksmbd_login_response_ext *resp; + + if (strlen(account) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ) + return NULL; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_login_request)); + if (!msg) + return NULL; + + msg->type = KSMBD_EVENT_LOGIN_REQUEST_EXT; + req = (struct ksmbd_login_request *)msg->payload; + req->handle = ksmbd_acquire_id(&ipc_ida); + strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ); + resp = ipc_msg_send_request(msg, req->handle); + ipc_msg_handle_free(req->handle); + ipc_msg_free(msg); + return resp; +} + struct ksmbd_spnego_authen_response * ksmbd_ipc_spnego_authen_request(const char *spnego_blob, int blob_len) { diff --git a/fs/smb/server/transport_ipc.h b/fs/smb/server/transport_ipc.h index 5e5b90a0c187..d9b6737f8cd0 100644 --- a/fs/smb/server/transport_ipc.h +++ b/fs/smb/server/transport_ipc.h @@ -12,6 +12,8 @@ struct ksmbd_login_response * ksmbd_ipc_login_request(const char *account); +struct ksmbd_login_response_ext * +ksmbd_ipc_login_request_ext(const char *account); struct ksmbd_session; struct ksmbd_share_config; diff --git a/fs/splice.c b/fs/splice.c index 06232d7e505f..2898fa1e9e63 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1564,21 +1564,6 @@ static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter, return ret; } -static int vmsplice_type(struct fd f, int *type) -{ - if (!fd_file(f)) - return -EBADF; - if (fd_file(f)->f_mode & FMODE_WRITE) { - *type = ITER_SOURCE; - } else if (fd_file(f)->f_mode & FMODE_READ) { - *type = ITER_DEST; - } else { - fdput(f); - return -EBADF; - } - return 0; -} - /* * Note that vmsplice only really supports true splicing _from_ user memory * to a pipe, not the other way around. 
Splicing from user memory is a simple @@ -1602,21 +1587,25 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, struct iovec *iov = iovstack; struct iov_iter iter; ssize_t error; - struct fd f; int type; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; - f = fdget(fd); - error = vmsplice_type(f, &type); - if (error) - return error; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; + if (fd_file(f)->f_mode & FMODE_WRITE) + type = ITER_SOURCE; + else if (fd_file(f)->f_mode & FMODE_READ) + type = ITER_DEST; + else + return -EBADF; error = import_iovec(type, uiov, nr_segs, ARRAY_SIZE(iovstack), &iov, &iter); if (error < 0) - goto out_fdput; + return error; if (!iov_iter_count(&iter)) error = 0; @@ -1626,8 +1615,6 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, error = vmsplice_to_user(fd_file(f), &iter, flags); kfree(iov); -out_fdput: - fdput(f); return error; } @@ -1635,27 +1622,22 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { - struct fd in, out; - ssize_t error; - if (unlikely(!len)) return 0; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; - error = -EBADF; - in = fdget(fd_in); - if (fd_file(in)) { - out = fdget(fd_out); - if (fd_file(out)) { - error = __do_splice(fd_file(in), off_in, fd_file(out), off_out, + CLASS(fd, in)(fd_in); + if (fd_empty(in)) + return -EBADF; + + CLASS(fd, out)(fd_out); + if (fd_empty(out)) + return -EBADF; + + return __do_splice(fd_file(in), off_in, fd_file(out), off_out, len, flags); - fdput(out); - } - fdput(in); - } - return error; } /* @@ -2005,25 +1987,19 @@ ssize_t do_tee(struct file *in, struct file *out, size_t len, SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { - struct fd in, out; - ssize_t error; - if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; if (unlikely(!len)) return 0; - error = -EBADF; - in = fdget(fdin); - if (fd_file(in)) { - out = 
fdget(fdout); - if (fd_file(out)) { - error = do_tee(fd_file(in), fd_file(out), len, flags); - fdput(out); - } - fdput(in); - } + CLASS(fd, in)(fdin); + if (fd_empty(in)) + return -EBADF; - return error; + CLASS(fd, out)(fdout); + if (fd_empty(out)) + return -EBADF; + + return do_tee(fd_file(in), fd_file(out), len, flags); } diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 22251743fadf..d19d4db74af8 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -30,7 +30,8 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; loff_t start_index = folio->index & ~mask; loff_t end_index = start_index | mask; - int i, n, pages, bytes, res = -ENOMEM; + loff_t index; + int i, pages, bytes, res = -ENOMEM; struct page **page, *last_page; struct squashfs_page_actor *actor; void *pageaddr; @@ -45,9 +46,9 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, return res; /* Try to grab all the pages covered by the Squashfs block */ - for (i = 0, n = start_index; n <= end_index; n++) { - page[i] = (n == folio->index) ? target_page : - grab_cache_page_nowait(target_page->mapping, n); + for (i = 0, index = start_index; index <= end_index; index++) { + page[i] = (index == folio->index) ? 
target_page : + grab_cache_page_nowait(target_page->mapping, index); if (page[i] == NULL) continue; diff --git a/fs/stat.c b/fs/stat.c index 41e598376d7e..0870e969a8a0 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -23,10 +23,46 @@ #include <linux/uaccess.h> #include <asm/unistd.h> +#include <trace/events/timestamp.h> + #include "internal.h" #include "mount.h" /** + * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED + * @stat: where to store the resulting values + * @request_mask: STATX_* values requested + * @inode: inode from which to grab the c/mtime + * + * Given @inode, grab the ctime and mtime out if it and store the result + * in @stat. When fetching the value, flag it as QUERIED (if not already) + * so the next write will record a distinct timestamp. + * + * NB: The QUERIED flag is tracked in the ctime, but we set it there even + * if only the mtime was requested, as that ensures that the next mtime + * change will be distinct. + */ +void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode) +{ + atomic_t *pcn = (atomic_t *)&inode->i_ctime_nsec; + + /* If neither time was requested, then don't report them */ + if (!(request_mask & (STATX_CTIME|STATX_MTIME))) { + stat->result_mask &= ~(STATX_CTIME|STATX_MTIME); + return; + } + + stat->mtime = inode_get_mtime(inode); + stat->ctime.tv_sec = inode->i_ctime_sec; + stat->ctime.tv_nsec = (u32)atomic_read(pcn); + if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED)) + stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn)); + stat->ctime.tv_nsec &= ~I_CTIME_QUERIED; + trace_fill_mg_cmtime(inode, &stat->ctime, &stat->mtime); +} +EXPORT_SYMBOL(fill_mg_cmtime); + +/** * generic_fillattr - Fill in the basic attributes from the inode struct * @idmap: idmap of the mount the inode was found from * @request_mask: statx request_mask @@ -58,8 +94,14 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); 
stat->atime = inode_get_atime(inode); - stat->mtime = inode_get_mtime(inode); - stat->ctime = inode_get_ctime(inode); + + if (is_mgtime(inode)) { + fill_mg_cmtime(stat, request_mask, inode); + } else { + stat->ctime = inode_get_ctime(inode); + stat->mtime = inode_get_mtime(inode); + } + stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; @@ -165,7 +207,7 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, if (inode->i_op->getattr) return inode->i_op->getattr(idmap, path, stat, request_mask, - query_flags | AT_GETATTR_NOSEC); + query_flags); generic_fillattr(idmap, request_mask, inode, stat); return 0; @@ -198,9 +240,6 @@ int vfs_getattr(const struct path *path, struct kstat *stat, { int retval; - if (WARN_ON_ONCE(query_flags & AT_GETATTR_NOSEC)) - return -EPERM; - retval = security_inode_getattr(path); if (retval) return retval; @@ -220,18 +259,13 @@ EXPORT_SYMBOL(vfs_getattr); */ int vfs_fstat(int fd, struct kstat *stat) { - struct fd f; - int error; - - f = fdget_raw(fd); - if (!fd_file(f)) + CLASS(fd_raw, f)(fd); + if (fd_empty(f)) return -EBADF; - error = vfs_getattr(&fd_file(f)->f_path, stat, STATX_BASIC_STATS, 0); - fdput(f); - return error; + return vfs_getattr(&fd_file(f)->f_path, stat, STATX_BASIC_STATS, 0); } -int getname_statx_lookup_flags(int flags) +static int statx_lookup_flags(int flags) { int lookup_flags = 0; @@ -239,8 +273,6 @@ int getname_statx_lookup_flags(int flags) lookup_flags |= LOOKUP_FOLLOW; if (!(flags & AT_NO_AUTOMOUNT)) lookup_flags |= LOOKUP_AUTOMOUNT; - if (flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; return lookup_flags; } @@ -277,7 +309,7 @@ static int vfs_statx_fd(int fd, int flags, struct kstat *stat, u32 request_mask) { CLASS(fd_raw, f)(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; return vfs_statx_path(&fd_file(f)->f_path, flags, stat, request_mask); } @@ -301,7 +333,7 @@ static int vfs_statx(int dfd, struct filename *filename, int flags, struct kstat *stat, u32 
request_mask) { struct path path; - unsigned int lookup_flags = getname_statx_lookup_flags(flags); + unsigned int lookup_flags = statx_lookup_flags(flags); int error; if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH | @@ -326,18 +358,11 @@ int vfs_fstatat(int dfd, const char __user *filename, { int ret; int statx_flags = flags | AT_NO_AUTOMOUNT; - struct filename *name; + struct filename *name = getname_maybe_null(filename, flags); - /* - * Work around glibc turning fstat() into fstatat(AT_EMPTY_PATH) - * - * If AT_EMPTY_PATH is set, we expect the common case to be that - * empty path, and avoid doing all the extra pathname work. - */ - if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename)) + if (!name && dfd >= 0) return vfs_fstat(dfd, stat); - name = getname_flags(filename, getname_statx_lookup_flags(statx_flags)); ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS); putname(name); @@ -774,24 +799,11 @@ SYSCALL_DEFINE5(statx, struct statx __user *, buffer) { int ret; - unsigned lflags; - struct filename *name; + struct filename *name = getname_maybe_null(filename, flags); - /* - * Short-circuit handling of NULL and "" paths. - * - * For a NULL path we require and accept only the AT_EMPTY_PATH flag - * (possibly |'d with AT_STATX flags). - * - * However, glibc on 32-bit architectures implements fstatat as statx - * with the "" pathname and AT_NO_AUTOMOUNT | AT_EMPTY_PATH flags. - * Supporting this results in the uglification below. 
- */ - lflags = flags & ~(AT_NO_AUTOMOUNT | AT_STATX_SYNC_TYPE); - if (lflags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename)) + if (!name && dfd >= 0) return do_statx_fd(dfd, flags & ~AT_NO_AUTOMOUNT, mask, buffer); - name = getname_flags(filename, getname_statx_lookup_flags(flags)); ret = do_statx(dfd, name, flags, mask, buffer); putname(name); diff --git a/fs/statfs.c b/fs/statfs.c index 9c7bb27e7932..a45ac85e6048 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -114,13 +114,11 @@ retry: int fd_statfs(int fd, struct kstatfs *st) { - struct fd f = fdget_raw(fd); - int error = -EBADF; - if (fd_file(f)) { - error = vfs_statfs(&fd_file(f)->f_path, st); - fdput(f); - } - return error; + CLASS(fd_raw, f)(fd); + + if (fd_empty(f)) + return -EBADF; + return vfs_statfs(&fd_file(f)->f_path, st); } static int do_statfs_native(struct kstatfs *st, struct statfs __user *p) diff --git a/fs/super.c b/fs/super.c index 1db230432960..c9c7223bc2a2 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1596,13 +1596,14 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, EXPORT_SYMBOL_GPL(setup_bdev_super); /** - * get_tree_bdev - Get a superblock based on a single block device + * get_tree_bdev_flags - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock + * @flags: GET_TREE_BDEV_* flags */ -int get_tree_bdev(struct fs_context *fc, - int (*fill_super)(struct super_block *, - struct fs_context *)) +int get_tree_bdev_flags(struct fs_context *fc, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc), unsigned int flags) { struct super_block *s; int error = 0; @@ -1613,10 +1614,10 @@ int get_tree_bdev(struct fs_context *fc, error = lookup_bdev(fc->source, &dev); if (error) { - errorf(fc, "%s: Can't lookup blockdev", fc->source); + if (!(flags & GET_TREE_BDEV_QUIET_LOOKUP)) + errorf(fc, "%s: Can't lookup blockdev", fc->source); return error; } - fc->sb_flags |= SB_NOSEC; s = 
sget_dev(fc, dev); if (IS_ERR(s)) @@ -1644,6 +1645,19 @@ int get_tree_bdev(struct fs_context *fc, fc->root = dget(s->s_root); return 0; } +EXPORT_SYMBOL_GPL(get_tree_bdev_flags); + +/** + * get_tree_bdev - Get a superblock based on a single block device + * @fc: The filesystem context holding the parameters + * @fill_super: Helper to initialise a new superblock + */ +int get_tree_bdev(struct fs_context *fc, + int (*fill_super)(struct super_block *, + struct fs_context *)) +{ + return get_tree_bdev_flags(fc, fill_super, 0); +} EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) diff --git a/fs/sync.c b/fs/sync.c index 67df255eb189..2955cd4c77a3 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -148,11 +148,11 @@ void emergency_sync(void) */ SYSCALL_DEFINE1(syncfs, int, fd) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); struct super_block *sb; int ret, ret2; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; sb = fd_file(f)->f_path.dentry->d_sb; @@ -162,7 +162,6 @@ SYSCALL_DEFINE1(syncfs, int, fd) ret2 = errseq_check_and_advance(&sb->s_wb_err, &fd_file(f)->f_sb_err); - fdput(f); return ret ? 
ret : ret2; } @@ -205,14 +204,12 @@ EXPORT_SYMBOL(vfs_fsync); static int do_fsync(unsigned int fd, int datasync) { - struct fd f = fdget(fd); - int ret = -EBADF; + CLASS(fd, f)(fd); - if (fd_file(f)) { - ret = vfs_fsync(fd_file(f), datasync); - fdput(f); - } - return ret; + if (fd_empty(f)) + return -EBADF; + + return vfs_fsync(fd_file(f), datasync); } SYSCALL_DEFINE1(fsync, unsigned int, fd) @@ -355,16 +352,12 @@ out: int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes, unsigned int flags) { - int ret; - struct fd f; + CLASS(fd, f)(fd); - ret = -EBADF; - f = fdget(fd); - if (fd_file(f)) - ret = sync_file_range(fd_file(f), offset, nbytes, flags); + if (fd_empty(f)) + return -EBADF; - fdput(f); - return ret; + return sync_file_range(fd_file(f), offset, nbytes, flags); } SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, diff --git a/fs/timerfd.c b/fs/timerfd.c index 137523e0bb21..4c32244b0508 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -394,19 +394,6 @@ static const struct file_operations timerfd_fops = { .unlocked_ioctl = timerfd_ioctl, }; -static int timerfd_fget(int fd, struct fd *p) -{ - struct fd f = fdget(fd); - if (!fd_file(f)) - return -EBADF; - if (fd_file(f)->f_op != &timerfd_fops) { - fdput(f); - return -EINVAL; - } - *p = f; - return 0; -} - SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) { int ufd; @@ -471,7 +458,6 @@ static int do_timerfd_settime(int ufd, int flags, const struct itimerspec64 *new, struct itimerspec64 *old) { - struct fd f; struct timerfd_ctx *ctx; int ret; @@ -479,15 +465,17 @@ static int do_timerfd_settime(int ufd, int flags, !itimerspec64_valid(new)) return -EINVAL; - ret = timerfd_fget(ufd, &f); - if (ret) - return ret; + CLASS(fd, f)(ufd); + if (fd_empty(f)) + return -EBADF; + + if (fd_file(f)->f_op != &timerfd_fops) + return -EINVAL; + ctx = fd_file(f)->private_data; - if (isalarm(ctx) && !capable(CAP_WAKE_ALARM)) { - fdput(f); + if (isalarm(ctx) && !capable(CAP_WAKE_ALARM)) return 
-EPERM; - } timerfd_setup_cancel(ctx, flags); @@ -535,17 +523,18 @@ static int do_timerfd_settime(int ufd, int flags, ret = timerfd_setup(ctx, flags, new); spin_unlock_irq(&ctx->wqh.lock); - fdput(f); return ret; } static int do_timerfd_gettime(int ufd, struct itimerspec64 *t) { - struct fd f; struct timerfd_ctx *ctx; - int ret = timerfd_fget(ufd, &f); - if (ret) - return ret; + CLASS(fd, f)(ufd); + + if (fd_empty(f)) + return -EBADF; + if (fd_file(f)->f_op != &timerfd_fops) + return -EINVAL; ctx = fd_file(f)->private_data; spin_lock_irq(&ctx->wqh.lock); @@ -567,7 +556,6 @@ static int do_timerfd_gettime(int ufd, struct itimerspec64 *t) t->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); t->it_interval = ktime_to_timespec64(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); - fdput(f); return 0; } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 1748dff58c3b..cfc614c638da 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -392,6 +392,9 @@ static int tracefs_reconfigure(struct fs_context *fc) struct tracefs_fs_info *sb_opts = sb->s_fs_info; struct tracefs_fs_info *new_opts = fc->s_fs_info; + if (!new_opts) + return 0; + sync_filesystem(sb); /* structure copy of new mount options to sb */ *sb_opts = *new_opts; @@ -478,14 +481,17 @@ static int tracefs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &tracefs_super_operations; sb->s_d_op = &tracefs_dentry_operations; - tracefs_apply_options(sb, false); - return 0; } static int tracefs_get_tree(struct fs_context *fc) { - return get_tree_single(fc, tracefs_fill_super); + int err = get_tree_single(fc, tracefs_fill_super); + + if (err) + return err; + + return tracefs_reconfigure(fc); } static void tracefs_free_fc(struct fs_context *fc) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 291583005dd1..3fb308b6e167 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -19,9 +19,9 @@ #include <linux/module.h> #include <linux/ctype.h> #include <linux/kthread.h> -#include 
<linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/seq_file.h> -#include <linux/mount.h> #include <linux/math64.h> #include <linux/writeback.h> #include "ubifs.h" @@ -981,177 +981,120 @@ enum { Opt_auth_key, Opt_auth_hash_name, Opt_ignore, - Opt_err, }; -static const match_table_t tokens = { - {Opt_fast_unmount, "fast_unmount"}, - {Opt_norm_unmount, "norm_unmount"}, - {Opt_bulk_read, "bulk_read"}, - {Opt_no_bulk_read, "no_bulk_read"}, - {Opt_chk_data_crc, "chk_data_crc"}, - {Opt_no_chk_data_crc, "no_chk_data_crc"}, - {Opt_override_compr, "compr=%s"}, - {Opt_auth_key, "auth_key=%s"}, - {Opt_auth_hash_name, "auth_hash_name=%s"}, - {Opt_ignore, "ubi=%s"}, - {Opt_ignore, "vol=%s"}, - {Opt_assert, "assert=%s"}, - {Opt_err, NULL}, +static const struct constant_table ubifs_param_compr[] = { + { "none", UBIFS_COMPR_NONE }, + { "lzo", UBIFS_COMPR_LZO }, + { "zlib", UBIFS_COMPR_ZLIB }, + { "zstd", UBIFS_COMPR_ZSTD }, + {} }; -/** - * parse_standard_option - parse a standard mount option. - * @option: the option to parse - * - * Normally, standard mount options like "sync" are passed to file-systems as - * flags. However, when a "rootflags=" kernel boot parameter is used, they may - * be present in the options string. This function tries to deal with this - * situation and parse standard options. Returns 0 if the option was not - * recognized, and the corresponding integer flag if it was. - * - * UBIFS is only interested in the "sync" option, so do not check for anything - * else. 
- */ -static int parse_standard_option(const char *option) -{ +static const struct constant_table ubifs_param_assert[] = { + { "report", ASSACT_REPORT }, + { "read-only", ASSACT_RO }, + { "panic", ASSACT_PANIC }, + {} +}; - pr_notice("UBIFS: parse %s\n", option); - if (!strcmp(option, "sync")) - return SB_SYNCHRONOUS; - return 0; -} +static const struct fs_parameter_spec ubifs_fs_param_spec[] = { + fsparam_flag ("fast_unmount", Opt_fast_unmount), + fsparam_flag ("norm_unmount", Opt_norm_unmount), + fsparam_flag ("bulk_read", Opt_bulk_read), + fsparam_flag ("no_bulk_read", Opt_no_bulk_read), + fsparam_flag ("chk_data_crc", Opt_chk_data_crc), + fsparam_flag ("no_chk_data_crc", Opt_no_chk_data_crc), + fsparam_enum ("compr", Opt_override_compr, ubifs_param_compr), + fsparam_enum ("assert", Opt_assert, ubifs_param_assert), + fsparam_string ("auth_key", Opt_auth_key), + fsparam_string ("auth_hash_name", Opt_auth_hash_name), + fsparam_string ("ubi", Opt_ignore), + fsparam_string ("vol", Opt_ignore), + {} +}; + +struct ubifs_fs_context { + struct ubifs_mount_opts mount_opts; + char *auth_key_name; + char *auth_hash_name; + unsigned int no_chk_data_crc:1; + unsigned int bulk_read:1; + unsigned int default_compr:2; + unsigned int assert_action:2; +}; /** - * ubifs_parse_options - parse mount parameters. - * @c: UBIFS file-system description object - * @options: parameters to parse - * @is_remount: non-zero if this is FS re-mount + * ubifs_parse_param - parse a parameter. + * @fc: the filesystem context + * @param: the parameter to parse * * This function parses UBIFS mount options and returns zero in case success * and a negative error code in case of failure. 
*/ -static int ubifs_parse_options(struct ubifs_info *c, char *options, - int is_remount) +static int ubifs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - - if (!options) - return 0; + struct ubifs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE); + int opt; - while ((p = strsep(&options, ","))) { - int token; + opt = fs_parse(fc, ubifs_fs_param_spec, param, &result); + if (opt < 0) + return opt; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { + switch (opt) { /* * %Opt_fast_unmount and %Opt_norm_unmount options are ignored. * We accept them in order to be backward-compatible. But this * should be removed at some point. */ - case Opt_fast_unmount: - c->mount_opts.unmount_mode = 2; - break; - case Opt_norm_unmount: - c->mount_opts.unmount_mode = 1; - break; - case Opt_bulk_read: - c->mount_opts.bulk_read = 2; - c->bulk_read = 1; - break; - case Opt_no_bulk_read: - c->mount_opts.bulk_read = 1; - c->bulk_read = 0; - break; - case Opt_chk_data_crc: - c->mount_opts.chk_data_crc = 2; - c->no_chk_data_crc = 0; - break; - case Opt_no_chk_data_crc: - c->mount_opts.chk_data_crc = 1; - c->no_chk_data_crc = 1; - break; - case Opt_override_compr: - { - char *name = match_strdup(&args[0]); - - if (!name) - return -ENOMEM; - if (!strcmp(name, "none")) - c->mount_opts.compr_type = UBIFS_COMPR_NONE; - else if (!strcmp(name, "lzo")) - c->mount_opts.compr_type = UBIFS_COMPR_LZO; - else if (!strcmp(name, "zlib")) - c->mount_opts.compr_type = UBIFS_COMPR_ZLIB; - else if (!strcmp(name, "zstd")) - c->mount_opts.compr_type = UBIFS_COMPR_ZSTD; - else { - ubifs_err(c, "unknown compressor \"%s\"", name); //FIXME: is c ready? 
- kfree(name); - return -EINVAL; - } - kfree(name); - c->mount_opts.override_compr = 1; - c->default_compr = c->mount_opts.compr_type; - break; - } - case Opt_assert: - { - char *act = match_strdup(&args[0]); - - if (!act) - return -ENOMEM; - if (!strcmp(act, "report")) - c->assert_action = ASSACT_REPORT; - else if (!strcmp(act, "read-only")) - c->assert_action = ASSACT_RO; - else if (!strcmp(act, "panic")) - c->assert_action = ASSACT_PANIC; - else { - ubifs_err(c, "unknown assert action \"%s\"", act); - kfree(act); - return -EINVAL; - } - kfree(act); - break; - } - case Opt_auth_key: - if (!is_remount) { - c->auth_key_name = kstrdup(args[0].from, - GFP_KERNEL); - if (!c->auth_key_name) - return -ENOMEM; - } - break; - case Opt_auth_hash_name: - if (!is_remount) { - c->auth_hash_name = kstrdup(args[0].from, - GFP_KERNEL); - if (!c->auth_hash_name) - return -ENOMEM; - } - break; - case Opt_ignore: - break; - default: - { - unsigned long flag; - struct super_block *sb = c->vfs_sb; - - flag = parse_standard_option(p); - if (!flag) { - ubifs_err(c, "unrecognized mount option \"%s\" or missing value", - p); - return -EINVAL; - } - sb->s_flags |= flag; - break; + case Opt_fast_unmount: + ctx->mount_opts.unmount_mode = 2; + break; + case Opt_norm_unmount: + ctx->mount_opts.unmount_mode = 1; + break; + case Opt_bulk_read: + ctx->mount_opts.bulk_read = 2; + ctx->bulk_read = 1; + break; + case Opt_no_bulk_read: + ctx->mount_opts.bulk_read = 1; + ctx->bulk_read = 0; + break; + case Opt_chk_data_crc: + ctx->mount_opts.chk_data_crc = 2; + ctx->no_chk_data_crc = 0; + break; + case Opt_no_chk_data_crc: + ctx->mount_opts.chk_data_crc = 1; + ctx->no_chk_data_crc = 1; + break; + case Opt_override_compr: + ctx->mount_opts.compr_type = result.uint_32; + ctx->mount_opts.override_compr = 1; + ctx->default_compr = ctx->mount_opts.compr_type; + break; + case Opt_assert: + ctx->assert_action = result.uint_32; + break; + case Opt_auth_key: + if (!is_remount) { + kfree(ctx->auth_key_name); + 
ctx->auth_key_name = param->string; + param->string = NULL; } + break; + case Opt_auth_hash_name: + if (!is_remount) { + kfree(ctx->auth_hash_name); + ctx->auth_hash_name = param->string; + param->string = NULL; } + break; + case Opt_ignore: + break; } return 0; @@ -2003,21 +1946,27 @@ static void ubifs_put_super(struct super_block *sb) mutex_unlock(&c->umount_mutex); } -static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) +static int ubifs_reconfigure(struct fs_context *fc) { + struct ubifs_fs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; int err; struct ubifs_info *c = sb->s_fs_info; sync_filesystem(sb); - dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); + dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, fc->sb_flags); - err = ubifs_parse_options(c, data, 1); - if (err) { - ubifs_err(c, "invalid or unknown remount parameter"); - return err; - } + /* + * Apply the mount option changes. + * auth_key_name and auth_hash_name are ignored on remount. 
+ */ + c->mount_opts = ctx->mount_opts; + c->bulk_read = ctx->bulk_read; + c->no_chk_data_crc = ctx->no_chk_data_crc; + c->default_compr = ctx->default_compr; + c->assert_action = ctx->assert_action; - if (c->ro_mount && !(*flags & SB_RDONLY)) { + if (c->ro_mount && !(fc->sb_flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/W due to prior errors"); return -EROFS; @@ -2029,7 +1978,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) err = ubifs_remount_rw(c); if (err) return err; - } else if (!c->ro_mount && (*flags & SB_RDONLY)) { + } else if (!c->ro_mount && (fc->sb_flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/O due to prior errors"); return -EROFS; @@ -2062,14 +2011,13 @@ const struct super_operations ubifs_super_operations = { .evict_inode = ubifs_evict_inode, .statfs = ubifs_statfs, .dirty_inode = ubifs_dirty_inode, - .remount_fs = ubifs_remount_fs, .show_options = ubifs_show_options, .sync_fs = ubifs_sync_fs, }; /** * open_ubi - parse UBI device name string and open the UBI device. - * @name: UBI volume name + * @fc: The filesystem context * @mode: UBI volume open mode * * The primary method of mounting UBIFS is by specifying the UBI volume @@ -2086,15 +2034,13 @@ const struct super_operations ubifs_super_operations = { * returns UBI volume description object in case of success and a negative * error code in case of failure. 
*/ -static struct ubi_volume_desc *open_ubi(const char *name, int mode) +static struct ubi_volume_desc *open_ubi(struct fs_context *fc, int mode) { struct ubi_volume_desc *ubi; + const char *name = fc->source; int dev, vol; char *endptr; - if (!name || !*name) - return ERR_PTR(-EINVAL); - /* First, try to open using the device node path method */ ubi = ubi_open_volume_path(name, mode); if (!IS_ERR(ubi)) @@ -2102,14 +2048,14 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) /* Try the "nodev" method */ if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') - return ERR_PTR(-EINVAL); + goto invalid_source; /* ubi:NAME method */ if ((name[3] == ':' || name[3] == '!') && name[4] != '\0') return ubi_open_volume_nm(0, name + 4, mode); if (!isdigit(name[3])) - return ERR_PTR(-EINVAL); + goto invalid_source; dev = simple_strtoul(name + 3, &endptr, 0); @@ -2121,7 +2067,7 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) if (*endptr == '_' && isdigit(endptr[1])) { vol = simple_strtoul(endptr + 1, &endptr, 0); if (*endptr != '\0') - return ERR_PTR(-EINVAL); + goto invalid_source; return ubi_open_volume(dev, vol, mode); } @@ -2129,7 +2075,8 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0') return ubi_open_volume_nm(dev, ++endptr, mode); - return ERR_PTR(-EINVAL); +invalid_source: + return ERR_PTR(invalf(fc, "Invalid source name")); } static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) @@ -2181,9 +2128,10 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) return c; } -static int ubifs_fill_super(struct super_block *sb, void *data, int silent) +static int ubifs_fill_super(struct super_block *sb, struct fs_context *fc) { struct ubifs_info *c = sb->s_fs_info; + struct ubifs_fs_context *ctx = fc->fs_private; struct inode *root; int err; @@ -2195,9 +2143,18 @@ static int ubifs_fill_super(struct super_block *sb, void 
*data, int silent) goto out; } - err = ubifs_parse_options(c, data, 0); - if (err) - goto out_close; + /* Copy in parsed mount options */ + c->mount_opts = ctx->mount_opts; + c->auth_key_name = ctx->auth_key_name; + c->auth_hash_name = ctx->auth_hash_name; + c->no_chk_data_crc = ctx->no_chk_data_crc; + c->bulk_read = ctx->bulk_read; + c->default_compr = ctx->default_compr; + c->assert_action = ctx->assert_action; + + /* ubifs_info owns auth strings now */ + ctx->auth_key_name = NULL; + ctx->auth_hash_name = NULL; /* * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For @@ -2264,41 +2221,38 @@ out: return err; } -static int sb_test(struct super_block *sb, void *data) +static int sb_test(struct super_block *sb, struct fs_context *fc) { - struct ubifs_info *c1 = data; + struct ubifs_info *c1 = fc->s_fs_info; struct ubifs_info *c = sb->s_fs_info; return c->vi.cdev == c1->vi.cdev; } -static int sb_set(struct super_block *sb, void *data) -{ - sb->s_fs_info = data; - return set_anon_super(sb, NULL); -} - -static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, - const char *name, void *data) +static int ubifs_get_tree(struct fs_context *fc) { struct ubi_volume_desc *ubi; struct ubifs_info *c; struct super_block *sb; int err; - dbg_gen("name %s, flags %#x", name, flags); + if (!fc->source || !*fc->source) + return invalf(fc, "No source specified"); + + dbg_gen("name %s, flags %#x", fc->source, fc->sb_flags); /* * Get UBI device number and volume ID. Mount it read-only so far * because this might be a new mount point, and UBI allows only one * read-write user at a time. 
*/ - ubi = open_ubi(name, UBI_READONLY); + ubi = open_ubi(fc, UBI_READONLY); if (IS_ERR(ubi)) { - if (!(flags & SB_SILENT)) + err = PTR_ERR(ubi); + if (!(fc->sb_flags & SB_SILENT)) pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", - current->pid, name, (int)PTR_ERR(ubi)); - return ERR_CAST(ubi); + current->pid, fc->source, err); + return err; } c = alloc_ubifs_info(ubi); @@ -2306,10 +2260,11 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, err = -ENOMEM; goto out_close; } + fc->s_fs_info = c; dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); - sb = sget(fs_type, sb_test, sb_set, flags, c); + sb = sget_fc(fc, sb_test, set_anon_super_fc); if (IS_ERR(sb)) { err = PTR_ERR(sb); kfree(c); @@ -2321,12 +2276,12 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, kfree(c); /* A new mount point for already mounted UBIFS */ dbg_gen("this ubi volume is already mounted"); - if (!!(flags & SB_RDONLY) != c1->ro_mount) { + if (!!(fc->sb_flags & SB_RDONLY) != c1->ro_mount) { err = -EBUSY; goto out_deact; } } else { - err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 
1 : 0); + err = ubifs_fill_super(sb, fc); if (err) goto out_deact; /* We do not support atime */ @@ -2340,13 +2295,14 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, /* 'fill_super()' opens ubi again so we must close it here */ ubi_close_volume(ubi); - return dget(sb->s_root); + fc->root = dget(sb->s_root); + return 0; out_deact: deactivate_locked_super(sb); out_close: ubi_close_volume(ubi); - return ERR_PTR(err); + return err; } static void kill_ubifs_super(struct super_block *s) @@ -2356,10 +2312,61 @@ static void kill_ubifs_super(struct super_block *s) kfree(c); } +static void ubifs_free_fc(struct fs_context *fc) +{ + struct ubifs_fs_context *ctx = fc->fs_private; + + if (ctx) { + kfree(ctx->auth_key_name); + kfree(ctx->auth_hash_name); + kfree(ctx); + } +} + +static const struct fs_context_operations ubifs_context_ops = { + .free = ubifs_free_fc, + .parse_param = ubifs_parse_param, + .get_tree = ubifs_get_tree, + .reconfigure = ubifs_reconfigure, +}; + +static int ubifs_init_fs_context(struct fs_context *fc) +{ + struct ubifs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct ubifs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) { + /* Initialize for first mount */ + ctx->no_chk_data_crc = 1; + ctx->assert_action = ASSACT_RO; + } else { + struct ubifs_info *c = fc->root->d_sb->s_fs_info; + + /* + * Preserve existing options across remounts. + * auth_key_name and auth_hash_name are not remountable. 
+ */ + ctx->mount_opts = c->mount_opts; + ctx->bulk_read = c->bulk_read; + ctx->no_chk_data_crc = c->no_chk_data_crc; + ctx->default_compr = c->default_compr; + ctx->assert_action = c->assert_action; + } + + fc->ops = &ubifs_context_ops; + fc->fs_private = ctx; + + return 0; +} + static struct file_system_type ubifs_fs_type = { .name = "ubifs", .owner = THIS_MODULE, - .mount = ubifs_mount, + .init_fs_context = ubifs_init_fs_context, + .parameters = ubifs_fs_param_spec, .kill_sb = kill_ubifs_super, }; MODULE_ALIAS_FS("ubifs"); diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 53c11be2b2c1..194ed3ab945e 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -33,6 +33,29 @@ static u64 ufs_bitmap_search (struct super_block *, struct ufs_cg_private_info * static unsigned char ufs_fragtable_8fpb[], ufs_fragtable_other[]; static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *, unsigned, int); +static void adjust_free_blocks(struct super_block *sb, + struct ufs_cylinder_group *ucg, + struct ufs_cg_private_info *ucpi, + unsigned fragment, int delta) +{ + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + + if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) + ufs_clusteracct(sb, ucpi, fragment, delta); + + fs32_add(sb, &ucg->cg_cs.cs_nbfree, delta); + uspi->cs_total.cs_nbfree += delta; + fs32_add(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, delta); + + if (uspi->fs_magic != UFS2_MAGIC) { + unsigned cylno = ufs_cbtocylno(fragment); + + fs16_add(sb, &ubh_cg_blks(ucpi, cylno, + ufs_cbtorpos(fragment)), delta); + fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), delta); + } +} + /* * Free 'count' fragments from fragment number 'fragment' */ @@ -43,7 +66,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, bit, end_bit, bbase, blkmap, i; - u64 blkno; sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -51,7 +73,7 @@ void 
ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) UFSD("ENTER, fragment %llu, count %u\n", (unsigned long long)fragment, count); - if (ufs_fragnum(fragment) + count > uspi->s_fpg) + if (ufs_fragnum(fragment) + count > uspi->s_fpb) ufs_error (sb, "ufs_free_fragments", "internal error"); mutex_lock(&UFS_SB(sb)->s_lock); @@ -94,23 +116,11 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) /* * Trying to reassemble free fragments into block */ - blkno = ufs_fragstoblks (bbase); - if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) { + if (ubh_isblockset(uspi, ucpi, bbase)) { fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb); uspi->cs_total.cs_nffree -= uspi->s_fpb; fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb); - if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) - ufs_clusteracct (sb, ucpi, blkno, 1); - fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); - uspi->cs_total.cs_nbfree++; - fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); - if (uspi->fs_magic != UFS2_MAGIC) { - unsigned cylno = ufs_cbtocylno (bbase); - - fs16_add(sb, &ubh_cg_blks(ucpi, cylno, - ufs_cbtorpos(bbase)), 1); - fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); - } + adjust_free_blocks(sb, ucg, ucpi, bbase, 1); } ubh_mark_buffer_dirty (USPI_UBH(uspi)); @@ -139,7 +149,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned overflow, cgno, bit, end_bit, i; - u64 blkno; sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -181,26 +190,12 @@ do_more: } for (i = bit; i < end_bit; i += uspi->s_fpb) { - blkno = ufs_fragstoblks(i); - if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) { + if (ubh_isblockset(uspi, ucpi, i)) { ufs_error(sb, "ufs_free_blocks", "freeing free fragment"); } - ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); + ubh_setblock(uspi, ucpi, i); inode_sub_bytes(inode, uspi->s_fpb << uspi->s_fshift); - if 
((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) - ufs_clusteracct (sb, ucpi, blkno, 1); - - fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); - uspi->cs_total.cs_nbfree++; - fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); - - if (uspi->fs_magic != UFS2_MAGIC) { - unsigned cylno = ufs_cbtocylno(i); - - fs16_add(sb, &ubh_cg_blks(ucpi, cylno, - ufs_cbtorpos(i)), 1); - fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); - } + adjust_free_blocks(sb, ucg, ucpi, i, 1); } ubh_mark_buffer_dirty (USPI_UBH(uspi)); @@ -234,13 +229,13 @@ failed: * situated at the end of file. * * We can come here from ufs_writepage or ufs_prepare_write, - * locked_page is argument of these functions, so we already lock it. + * locked_folio is argument of these functions, so we already lock it. */ static void ufs_change_blocknr(struct inode *inode, sector_t beg, unsigned int count, sector_t oldb, - sector_t newb, struct page *locked_page) + sector_t newb, struct folio *locked_folio) { - struct folio *folio, *locked_folio = page_folio(locked_page); + struct folio *folio; const unsigned blks_per_page = 1 << (PAGE_SHIFT - inode->i_blkbits); const unsigned mask = blks_per_page - 1; @@ -337,7 +332,7 @@ static void ufs_clear_frags(struct inode *inode, sector_t beg, unsigned int n, u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, u64 goal, unsigned count, int *err, - struct page *locked_page) + struct folio *locked_folio) { struct super_block * sb; struct ufs_sb_private_info * uspi; @@ -417,7 +412,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, result = ufs_alloc_fragments (inode, cgno, goal, count, err); if (result) { ufs_clear_frags(inode, result + oldcount, - newcount - oldcount, locked_page != NULL); + newcount - oldcount, locked_folio != NULL); *err = 0; write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); @@ -441,7 +436,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, fragment + count); 
read_sequnlock_excl(&UFS_I(inode)->meta_lock); ufs_clear_frags(inode, result + oldcount, newcount - oldcount, - locked_page != NULL); + locked_folio != NULL); mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; @@ -462,11 +457,11 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, result = ufs_alloc_fragments (inode, cgno, goal, request, err); if (result) { ufs_clear_frags(inode, result + oldcount, newcount - oldcount, - locked_page != NULL); + locked_folio != NULL); mutex_unlock(&UFS_SB(sb)->s_lock); ufs_change_blocknr(inode, fragment - oldcount, oldcount, uspi->s_sbbase + tmp, - uspi->s_sbbase + result, locked_page); + uspi->s_sbbase + result, locked_folio); *err = 0; write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); @@ -698,7 +693,7 @@ static u64 ufs_alloccg_block(struct inode *inode, struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_cylinder_group * ucg; - u64 result, blkno; + u64 result; UFSD("ENTER, goal %llu\n", (unsigned long long)goal); @@ -716,7 +711,7 @@ static u64 ufs_alloccg_block(struct inode *inode, /* * If the requested block is available, use it. 
*/ - if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) { + if (ubh_isblockset(uspi, ucpi, goal)) { result = goal; goto gotit; } @@ -729,22 +724,8 @@ norot: gotit: if (!try_add_frags(inode, uspi->s_fpb)) return 0; - blkno = ufs_fragstoblks(result); - ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); - if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) - ufs_clusteracct (sb, ucpi, blkno, -1); - - fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); - uspi->cs_total.cs_nbfree--; - fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1); - - if (uspi->fs_magic != UFS2_MAGIC) { - unsigned cylno = ufs_cbtocylno((unsigned)result); - - fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, - ufs_cbtorpos((unsigned)result)), 1); - fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1); - } + ubh_clrblock(uspi, ucpi, result); + adjust_free_blocks(sb, ucg, ucpi, result, -1); UFSD("EXIT, result %llu\n", (unsigned long long)result); @@ -863,12 +844,12 @@ static u64 ufs_bitmap_search(struct super_block *sb, } static void ufs_clusteracct(struct super_block * sb, - struct ufs_cg_private_info * ucpi, unsigned blkno, int cnt) + struct ufs_cg_private_info * ucpi, unsigned frag, int cnt) { - struct ufs_sb_private_info * uspi; + struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; int i, start, end, forw, back; + unsigned blkno = ufs_fragstoblks(frag); - uspi = UFS_SB(sb)->s_uspi; if (uspi->s_contigsumsize <= 0) return; diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c index 1abe5454de47..a2813270c303 100644 --- a/fs/ufs/cylinder.c +++ b/fs/ufs/cylinder.c @@ -26,7 +26,7 @@ * Read cylinder group into cache. The memory space for ufs_cg_private_info * structure is already allocated during ufs_read_super. 
*/ -static void ufs_read_cylinder (struct super_block * sb, +static bool ufs_read_cylinder(struct super_block *sb, unsigned cgno, unsigned bitmap_nr) { struct ufs_sb_info * sbi = UFS_SB(sb); @@ -46,9 +46,11 @@ static void ufs_read_cylinder (struct super_block * sb, * We have already the first fragment of cylinder group block in buffer */ UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno]; - for (i = 1; i < UCPI_UBH(ucpi)->count; i++) - if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i))) + for (i = 1; i < UCPI_UBH(ucpi)->count; i++) { + UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i); + if (!UCPI_UBH(ucpi)->bh[i]) goto failed; + } sbi->s_cgno[bitmap_nr] = cgno; ucpi->c_cgx = fs32_to_cpu(sb, ucg->cg_cgx); @@ -67,13 +69,14 @@ static void ufs_read_cylinder (struct super_block * sb, ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff); ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks); UFSD("EXIT\n"); - return; + return true; failed: for (j = 1; j < i; j++) - brelse (sbi->s_ucg[j]); + brelse(UCPI_UBH(ucpi)->bh[j]); sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY; ufs_error (sb, "ufs_read_cylinder", "can't read cylinder group block %u", cgno); + return false; } /* @@ -156,15 +159,14 @@ struct ufs_cg_private_info * ufs_load_cylinder ( UFSD("EXIT (FAILED)\n"); return NULL; } - else { - UFSD("EXIT\n"); - return sbi->s_ucpi[cgno]; - } } else { - ufs_read_cylinder (sb, cgno, cgno); - UFSD("EXIT\n"); - return sbi->s_ucpi[cgno]; + if (unlikely(!ufs_read_cylinder (sb, cgno, cgno))) { + UFSD("EXIT (FAILED)\n"); + return NULL; + } } + UFSD("EXIT\n"); + return sbi->s_ucpi[cgno]; } /* * Cylinder group number cg is in cache but it was not last used, @@ -195,7 +197,10 @@ struct ufs_cg_private_info * ufs_load_cylinder ( sbi->s_ucpi[j] = sbi->s_ucpi[j-1]; } sbi->s_ucpi[0] = ucpi; - ufs_read_cylinder (sb, cgno, 0); + if (unlikely(!ufs_read_cylinder (sb, cgno, 0))) { + UFSD("EXIT (FAILED)\n"); + return NULL; + } } 
UFSD("EXIT\n"); return sbi->s_ucpi[0]; diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index d6e6a2198971..88d0062cfdb9 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -81,10 +81,9 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) } -/* Releases the page */ -void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct folio *folio, struct inode *inode, - bool update_times) +int ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, + struct folio *folio, struct inode *inode, + bool update_times) { loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen); @@ -92,17 +91,19 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, folio_lock(folio); err = ufs_prepare_chunk(folio, pos, len); - BUG_ON(err); + if (unlikely(err)) { + folio_unlock(folio); + return err; + } de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); ufs_set_de_type(dir->i_sb, de, inode->i_mode); ufs_commit_chunk(folio, pos, len); - folio_release_kmap(folio, de); if (update_times) inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); - ufs_handle_dirsync(dir); + return ufs_handle_dirsync(dir); } static bool ufs_check_folio(struct folio *folio, char *kaddr) @@ -505,8 +506,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, if (de->d_reclen == 0) { ufs_error(inode->i_sb, __func__, "zero-length directory entry"); - err = -EIO; - goto out; + return -EIO; } pde = de; de = ufs_next_entry(sb, de); @@ -516,18 +516,17 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, pos = folio_pos(folio) + from; folio_lock(folio); err = ufs_prepare_chunk(folio, pos, to - from); - BUG_ON(err); + if (unlikely(err)) { + folio_unlock(folio); + return err; + } if (pde) pde->d_reclen = cpu_to_fs16(sb, to - from); dir->d_ino = 0; ufs_commit_chunk(folio, pos, to - from); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); - err = 
ufs_handle_dirsync(inode); -out: - folio_release_kmap(folio, kaddr); - UFSD("EXIT\n"); - return err; + return ufs_handle_dirsync(inode); } int ufs_make_empty(struct inode * inode, struct inode *dir) diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 6558882a89ef..487ad1fc2de6 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -42,4 +42,5 @@ const struct file_operations ufs_file_operations = { .open = generic_file_open, .fsync = generic_file_fsync, .splice_read = filemap_splice_read, + .splice_write = iter_file_splice_write, }; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 5331ae7ebf3e..7dc38fdef2ea 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -220,7 +220,7 @@ changed: */ static bool ufs_extend_tail(struct inode *inode, u64 writes_to, - int *err, struct page *locked_page) + int *err, struct folio *locked_folio) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; @@ -239,7 +239,7 @@ ufs_extend_tail(struct inode *inode, u64 writes_to, p = ufs_get_direct_data_ptr(uspi, ufsi, block); tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p), new_size - (lastfrag & uspi->s_fpbmask), err, - locked_page); + locked_folio); return tmp != 0; } @@ -250,12 +250,11 @@ ufs_extend_tail(struct inode *inode, u64 writes_to, * @new_fragment: number of new allocated fragment(s) * @err: we set it if something wrong * @new: we set it if we allocate new block - * @locked_page: for ufs_new_fragments() + * @locked_folio: for ufs_new_fragments() */ -static u64 -ufs_inode_getfrag(struct inode *inode, unsigned index, +static u64 ufs_inode_getfrag(struct inode *inode, unsigned index, sector_t new_fragment, int *err, - int *new, struct page *locked_page) + int *new, struct folio *locked_folio) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; @@ -264,11 +263,6 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, unsigned nfrags = uspi->s_fpb; void *p; - /* TODO : to be done for write support - if ( 
(flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) - goto ufs2; - */ - p = ufs_get_direct_data_ptr(uspi, ufsi, index); tmp = ufs_data_ptr_to_cpu(sb, p); if (tmp) @@ -288,7 +282,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, goal += uspi->s_fpb; } tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), - goal, nfrags, err, locked_page); + goal, nfrags, err, locked_folio); if (!tmp) { *err = -ENOSPC; @@ -303,21 +297,6 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, mark_inode_dirty(inode); out: return tmp + uspi->s_sbbase; - - /* This part : To be implemented .... - Required only for writing, not required for READ-ONLY. -ufs2: - - u2_block = ufs_fragstoblks(fragment); - u2_blockoff = ufs_fragnum(fragment); - p = ufsi->i_u1.u2_i_data + block; - goal = 0; - -repeat2: - tmp = fs32_to_cpu(sb, *p); - lastfrag = ufsi->i_lastfrag; - - */ } /** @@ -329,12 +308,11 @@ repeat2: * (block will hold this fragment and also uspi->s_fpb-1) * @err: see ufs_inode_getfrag() * @new: see ufs_inode_getfrag() - * @locked_page: see ufs_inode_getfrag() + * @locked_folio: see ufs_inode_getfrag() */ -static u64 -ufs_inode_getblock(struct inode *inode, u64 ind_block, - unsigned index, sector_t new_fragment, int *err, - int *new, struct page *locked_page) +static u64 ufs_inode_getblock(struct inode *inode, u64 ind_block, + unsigned index, sector_t new_fragment, int *err, + int *new, struct folio *locked_folio) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; @@ -369,7 +347,7 @@ ufs_inode_getblock(struct inode *inode, u64 ind_block, else goal = bh->b_blocknr + uspi->s_fpb; tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal, - uspi->s_fpb, err, locked_page); + uspi->s_fpb, err, locked_folio); if (!tmp) goto out; @@ -434,14 +412,14 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff unsigned tailfrags = lastfrag & uspi->s_fpbmask; if (tailfrags && fragment >= lastfrag) { if 
(!ufs_extend_tail(inode, fragment, - &err, bh_result->b_page)) + &err, bh_result->b_folio)) goto out; } } if (depth == 1) { phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, - &err, &new, bh_result->b_page); + &err, &new, bh_result->b_folio); } else { int i; phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, @@ -450,7 +428,7 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff phys64 = ufs_inode_getblock(inode, phys64, offsets[i], fragment, &err, NULL, NULL); phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1], - fragment, &err, &new, bh_result->b_page); + fragment, &err, &new, bh_result->b_folio); } out: if (phys64) { @@ -898,91 +876,84 @@ static inline void free_data(struct to_free *ctx, u64 from, unsigned count) #define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) +/* + * used only for truncation down to direct blocks. + */ static void ufs_trunc_direct(struct inode *inode) { struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block * sb; - struct ufs_sb_private_info * uspi; - void *p; - u64 frag1, frag2, frag3, frag4, block1, block2; + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned int new_frags, old_frags; + unsigned int old_slot, new_slot; + unsigned int old_tail, new_tail; struct to_free ctx = {.inode = inode}; - unsigned i, tmp; UFSD("ENTER: ino %lu\n", inode->i_ino); - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - frag1 = DIRECT_FRAGMENT; - frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); - frag2 = ((frag1 & uspi->s_fpbmask) ? 
((frag1 | uspi->s_fpbmask) + 1) : frag1); - frag3 = frag4 & ~uspi->s_fpbmask; - block1 = block2 = 0; - if (frag2 > frag3) { - frag2 = frag4; - frag3 = frag4 = 0; - } else if (frag2 < frag3) { - block1 = ufs_fragstoblks (frag2); - block2 = ufs_fragstoblks (frag3); - } - - UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu," - " frag3 %llu, frag4 %llu\n", inode->i_ino, - (unsigned long long)frag1, (unsigned long long)frag2, - (unsigned long long)block1, (unsigned long long)block2, - (unsigned long long)frag3, (unsigned long long)frag4); - - if (frag1 >= frag2) - goto next1; + new_frags = DIRECT_FRAGMENT; + // new_frags = first fragment past the new EOF + old_frags = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); + // old_frags = first fragment past the old EOF or covered by indirects - /* - * Free first free fragments - */ - p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1)); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp ) - ufs_panic (sb, "ufs_trunc_direct", "internal error"); - frag2 -= frag1; - frag1 = ufs_fragnum (frag1); + if (new_frags >= old_frags) // expanding - nothing to free + goto done; - ufs_free_fragments(inode, tmp + frag1, frag2); + old_tail = ufs_fragnum(old_frags); + old_slot = ufs_fragstoblks(old_frags); + new_tail = ufs_fragnum(new_frags); + new_slot = ufs_fragstoblks(new_frags); -next1: - /* - * Free whole blocks - */ - for (i = block1 ; i < block2; i++) { - p = ufs_get_direct_data_ptr(uspi, ufsi, i); - tmp = ufs_data_ptr_to_cpu(sb, p); + if (old_slot == new_slot) { // old_tail > 0 + void *p = ufs_get_direct_data_ptr(uspi, ufsi, old_slot); + u64 tmp = ufs_data_ptr_to_cpu(sb, p); if (!tmp) - continue; - write_seqlock(&ufsi->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&ufsi->meta_lock); + ufs_panic(sb, __func__, "internal error"); + if (!new_tail) { + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + } + ufs_free_fragments(inode, tmp + new_tail, 
old_tail - new_tail); + } else { + unsigned int slot = new_slot; - free_data(&ctx, tmp, uspi->s_fpb); - } + if (new_tail) { + void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot++); + u64 tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + ufs_panic(sb, __func__, "internal error"); - free_data(&ctx, 0, 0); + ufs_free_fragments(inode, tmp + new_tail, + uspi->s_fpb - new_tail); + } + while (slot < old_slot) { + void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot++); + u64 tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + continue; + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); - if (frag3 >= frag4) - goto next3; + free_data(&ctx, tmp, uspi->s_fpb); + } - /* - * Free last free fragments - */ - p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3)); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp ) - ufs_panic(sb, "ufs_truncate_direct", "internal error"); - frag4 = ufs_fragnum (frag4); - write_seqlock(&ufsi->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&ufsi->meta_lock); + free_data(&ctx, 0, 0); - ufs_free_fragments (inode, tmp, frag4); - next3: + if (old_tail) { + void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot); + u64 tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + ufs_panic(sb, __func__, "internal error"); + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + ufs_free_fragments(inode, tmp, old_tail); + } + } +done: UFSD("EXIT: ino %lu\n", inode->i_ino); } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index c8390976ab6a..38a024c8cccd 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -210,20 +210,18 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) struct inode * inode = d_inode(dentry); struct ufs_dir_entry *de; struct folio *folio; - int err = -ENOENT; + int err; de = ufs_find_entry(dir, &dentry->d_name, &folio); if (!de) - goto out; + return -ENOENT; err = ufs_delete_entry(dir, de, folio); - if (err) - goto out; - - 
inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); - inode_dec_link_count(inode); - err = 0; -out: + if (!err) { + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); + inode_dec_link_count(inode); + } + folio_release_kmap(folio, de); return err; } @@ -253,14 +251,14 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct ufs_dir_entry * dir_de = NULL; struct folio *old_folio; struct ufs_dir_entry *old_de; - int err = -ENOENT; + int err; if (flags & ~RENAME_NOREPLACE) return -EINVAL; old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_folio); if (!old_de) - goto out; + return -ENOENT; if (S_ISDIR(old_inode->i_mode)) { err = -EIO; @@ -281,7 +279,10 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_folio); if (!new_de) goto out_dir; - ufs_set_link(new_dir, new_de, new_folio, old_inode, 1); + err = ufs_set_link(new_dir, new_de, new_folio, old_inode, 1); + folio_release_kmap(new_folio, new_de); + if (err) + goto out_dir; inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); @@ -299,26 +300,20 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, * rename. 
*/ inode_set_ctime_current(old_inode); - - ufs_delete_entry(old_dir, old_de, old_folio); mark_inode_dirty(old_inode); - if (dir_de) { + err = ufs_delete_entry(old_dir, old_de, old_folio); + if (!err && dir_de) { if (old_dir != new_dir) - ufs_set_link(old_inode, dir_de, dir_folio, new_dir, 0); - else - folio_release_kmap(dir_folio, dir_de); + err = ufs_set_link(old_inode, dir_de, dir_folio, + new_dir, 0); inode_dec_link_count(old_dir); } - return 0; - - out_dir: if (dir_de) folio_release_kmap(dir_folio, dir_de); out_old: folio_release_kmap(old_folio, old_de); -out: return err; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index bc625788589c..762699c1bcf6 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -505,7 +505,6 @@ static int ufs_read_cylinder_structures(struct super_block *sb) { struct ufs_sb_info *sbi = UFS_SB(sb); struct ufs_sb_private_info *uspi = sbi->s_uspi; - struct ufs_buffer_head * ubh; unsigned char * base, * space; unsigned size, blks, i; @@ -521,21 +520,13 @@ static int ufs_read_cylinder_structures(struct super_block *sb) if (!base) goto failed; sbi->s_csp = (struct ufs_csum *)space; - for (i = 0; i < blks; i += uspi->s_fpb) { - size = uspi->s_bsize; - if (i + uspi->s_fpb > blks) - size = (blks - i) * uspi->s_fsize; - - ubh = ubh_bread(sb, uspi->s_csaddr + i, size); - - if (!ubh) + for (i = 0; i < blks; i++) { + struct buffer_head *bh = sb_bread(sb, uspi->s_csaddr + i); + if (!bh) goto failed; - - ubh_ubhcpymem (space, ubh, size); - - space += size; - ubh_brelse (ubh); - ubh = NULL; + memcpy(space, bh->b_data, uspi->s_fsize); + space += uspi->s_fsize; + brelse (bh); } /* @@ -645,7 +636,6 @@ static void ufs_put_super_internal(struct super_block *sb) { struct ufs_sb_info *sbi = UFS_SB(sb); struct ufs_sb_private_info *uspi = sbi->s_uspi; - struct ufs_buffer_head * ubh; unsigned char * base, * space; unsigned blks, size, i; @@ -656,18 +646,17 @@ static void ufs_put_super_internal(struct super_block *sb) size = uspi->s_cssize; blks = (size + 
uspi->s_fsize - 1) >> uspi->s_fshift; base = space = (char*) sbi->s_csp; - for (i = 0; i < blks; i += uspi->s_fpb) { - size = uspi->s_bsize; - if (i + uspi->s_fpb > blks) - size = (blks - i) * uspi->s_fsize; - - ubh = ubh_bread(sb, uspi->s_csaddr + i, size); - - ubh_memcpyubh (ubh, space, size); - space += size; - ubh_mark_buffer_uptodate (ubh, 1); - ubh_mark_buffer_dirty (ubh); - ubh_brelse (ubh); + for (i = 0; i < blks; i++, space += uspi->s_fsize) { + struct buffer_head *bh = sb_bread(sb, uspi->s_csaddr + i); + + if (unlikely(!bh)) { // better than an oops... + ufs_panic(sb, __func__, + "can't write part of cylinder group summary"); + continue; + } + memcpy(bh->b_data, space, uspi->s_fsize); + mark_buffer_dirty(bh); + brelse(bh); } for (i = 0; i < sbi->s_cg_loaded; i++) { ufs_put_cylinder (sb, i); @@ -1240,11 +1229,7 @@ magic_found: else uspi->s_apbshift = uspi->s_bshift - 2; - uspi->s_2apbshift = uspi->s_apbshift * 2; - uspi->s_3apbshift = uspi->s_apbshift * 3; uspi->s_apb = 1 << uspi->s_apbshift; - uspi->s_2apb = 1 << uspi->s_2apbshift; - uspi->s_3apb = 1 << uspi->s_3apbshift; uspi->s_apbmask = uspi->s_apb - 1; uspi->s_nspfshift = uspi->s_fshift - UFS_SECTOR_BITS; uspi->s_nspb = uspi->s_nspf << uspi->s_fpbshift; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index a2c762cb65a0..e7df65dd4351 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -88,10 +88,10 @@ struct ufs_inode_info { #endif /* balloc.c */ -extern void ufs_free_fragments (struct inode *, u64, unsigned); -extern void ufs_free_blocks (struct inode *, u64, unsigned); -extern u64 ufs_new_fragments(struct inode *, void *, u64, u64, - unsigned, int *, struct page *); +void ufs_free_fragments (struct inode *, u64 fragment, unsigned count); +void ufs_free_blocks (struct inode *, u64 fragment, unsigned count); +u64 ufs_new_fragments(struct inode *, void *, u64 fragment, u64 goal, + unsigned count, int *err, struct folio *); /* cylinder.c */ extern struct ufs_cg_private_info * ufs_load_cylinder (struct super_block 
*, unsigned); @@ -108,8 +108,8 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct folio *); int ufs_empty_dir(struct inode *); struct ufs_dir_entry *ufs_dotdot(struct inode *, struct folio **); -void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct folio *folio, struct inode *inode, bool update_times); +int ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, + struct folio *folio, struct inode *inode, bool update_times); /* file.c */ extern const struct inode_operations ufs_file_inode_operations; diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h index ef9ead44776a..0905f9a16b91 100644 --- a/fs/ufs/ufs_fs.h +++ b/fs/ufs/ufs_fs.h @@ -775,12 +775,8 @@ struct ufs_sb_private_info { __u32 s_fpbmask; /* fragments per block mask */ __u32 s_apb; /* address per block */ - __u32 s_2apb; /* address per block^2 */ - __u32 s_3apb; /* address per block^3 */ __u32 s_apbmask; /* address per block mask */ __u32 s_apbshift; /* address per block shift */ - __u32 s_2apbshift; /* address per block shift * 2 */ - __u32 s_3apbshift; /* address per block shift * 3 */ __u32 s_nspfshift; /* number of sector per fragment shift */ __u32 s_nspb; /* number of sector per block */ __u32 s_inopf; /* inodes per fragment */ diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 2acf191eb89e..f0e906ab4ddd 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -99,20 +99,6 @@ void ubh_mark_buffer_dirty (struct ufs_buffer_head * ubh) mark_buffer_dirty (ubh->bh[i]); } -void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag) -{ - unsigned i; - if (!ubh) - return; - if (flag) { - for ( i = 0; i < ubh->count; i++ ) - set_buffer_uptodate (ubh->bh[i]); - } else { - for ( i = 0; i < ubh->count; i++ ) - clear_buffer_uptodate (ubh->bh[i]); - } -} - void ubh_sync_block(struct ufs_buffer_head *ubh) { if (ubh) { @@ -146,38 +132,6 @@ int ubh_buffer_dirty (struct ufs_buffer_head * ubh) return result; 
} -void _ubh_ubhcpymem_(struct ufs_sb_private_info * uspi, - unsigned char * mem, struct ufs_buffer_head * ubh, unsigned size) -{ - unsigned len, bhno; - if (size > (ubh->count << uspi->s_fshift)) - size = ubh->count << uspi->s_fshift; - bhno = 0; - while (size) { - len = min_t(unsigned int, size, uspi->s_fsize); - memcpy (mem, ubh->bh[bhno]->b_data, len); - mem += uspi->s_fsize; - size -= len; - bhno++; - } -} - -void _ubh_memcpyubh_(struct ufs_sb_private_info * uspi, - struct ufs_buffer_head * ubh, unsigned char * mem, unsigned size) -{ - unsigned len, bhno; - if (size > (ubh->count << uspi->s_fshift)) - size = ubh->count << uspi->s_fshift; - bhno = 0; - while (size) { - len = min_t(unsigned int, size, uspi->s_fsize); - memcpy (ubh->bh[bhno]->b_data, mem, len); - mem += uspi->s_fsize; - size -= len; - bhno++; - } -} - dev_t ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi) { diff --git a/fs/ufs/util.h b/fs/ufs/util.h index bf708b68f150..391bb4f11d74 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -263,14 +263,9 @@ extern struct ufs_buffer_head * ubh_bread_uspi(struct ufs_sb_private_info *, str extern void ubh_brelse (struct ufs_buffer_head *); extern void ubh_brelse_uspi (struct ufs_sb_private_info *); extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); -extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int); extern void ubh_sync_block(struct ufs_buffer_head *); extern void ubh_bforget (struct ufs_buffer_head *); extern int ubh_buffer_dirty (struct ufs_buffer_head *); -#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size) -extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struct ufs_buffer_head *, unsigned); -#define ubh_memcpyubh(ubh,mem,size) _ubh_memcpyubh_(uspi,ubh,mem,size) -extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned); /* This functions works with cache pages*/ struct folio *ufs_get_locked_folio(struct 
address_space *mapping, pgoff_t index); @@ -455,65 +450,69 @@ static inline unsigned _ubh_find_last_zero_bit_( return (base << uspi->s_bpfshift) + pos - begin; } -#define ubh_isblockclear(ubh,begin,block) (!_ubh_isblockset_(uspi,ubh,begin,block)) - -#define ubh_isblockset(ubh,begin,block) _ubh_isblockset_(uspi,ubh,begin,block) -static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi, - struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +static inline int ubh_isblockset(struct ufs_sb_private_info *uspi, + struct ufs_cg_private_info *ucpi, unsigned int frag) { + struct ufs_buffer_head *ubh = UCPI_UBH(ucpi); + u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3)); u8 mask; + switch (uspi->s_fpb) { case 8: - return (*ubh_get_addr (ubh, begin + block) == 0xff); + return *p == 0xff; case 4: - mask = 0x0f << ((block & 0x01) << 2); - return (*ubh_get_addr (ubh, begin + (block >> 1)) & mask) == mask; + mask = 0x0f << (frag & 4); + return (*p & mask) == mask; case 2: - mask = 0x03 << ((block & 0x03) << 1); - return (*ubh_get_addr (ubh, begin + (block >> 2)) & mask) == mask; + mask = 0x03 << (frag & 6); + return (*p & mask) == mask; case 1: - mask = 0x01 << (block & 0x07); - return (*ubh_get_addr (ubh, begin + (block >> 3)) & mask) == mask; + mask = 0x01 << (frag & 7); + return (*p & mask) == mask; } return 0; } -#define ubh_clrblock(ubh,begin,block) _ubh_clrblock_(uspi,ubh,begin,block) -static inline void _ubh_clrblock_(struct ufs_sb_private_info * uspi, - struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +static inline void ubh_clrblock(struct ufs_sb_private_info *uspi, + struct ufs_cg_private_info *ucpi, unsigned int frag) { + struct ufs_buffer_head *ubh = UCPI_UBH(ucpi); + u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3)); + switch (uspi->s_fpb) { case 8: - *ubh_get_addr (ubh, begin + block) = 0x00; + *p = 0x00; return; case 4: - *ubh_get_addr (ubh, begin + (block >> 1)) &= ~(0x0f << ((block & 0x01) << 2)); + *p &= ~(0x0f << 
(frag & 4)); return; case 2: - *ubh_get_addr (ubh, begin + (block >> 2)) &= ~(0x03 << ((block & 0x03) << 1)); + *p &= ~(0x03 << (frag & 6)); return; case 1: - *ubh_get_addr (ubh, begin + (block >> 3)) &= ~(0x01 << ((block & 0x07))); + *p &= ~(0x01 << (frag & 7)); return; } } -#define ubh_setblock(ubh,begin,block) _ubh_setblock_(uspi,ubh,begin,block) -static inline void _ubh_setblock_(struct ufs_sb_private_info * uspi, - struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +static inline void ubh_setblock(struct ufs_sb_private_info * uspi, + struct ufs_cg_private_info *ucpi, unsigned int frag) { + struct ufs_buffer_head *ubh = UCPI_UBH(ucpi); + u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3)); + switch (uspi->s_fpb) { case 8: - *ubh_get_addr(ubh, begin + block) = 0xff; + *p = 0xff; return; case 4: - *ubh_get_addr(ubh, begin + (block >> 1)) |= (0x0f << ((block & 0x01) << 2)); + *p |= 0x0f << (frag & 4); return; case 2: - *ubh_get_addr(ubh, begin + (block >> 2)) |= (0x03 << ((block & 0x03) << 1)); + *p |= 0x03 << (frag & 6); return; case 1: - *ubh_get_addr(ubh, begin + (block >> 3)) |= (0x01 << ((block & 0x07))); + *p |= 0x01 << (frag & 7); return; } } diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c index 8395066341a4..7f7cb14e01ce 100644 --- a/fs/unicode/utf8-core.c +++ b/fs/unicode/utf8-core.c @@ -214,3 +214,29 @@ void utf8_unload(struct unicode_map *um) } EXPORT_SYMBOL(utf8_unload); +/** + * utf8_parse_version - Parse a UTF-8 version number from a string + * + * @version: input string + * + * Returns the parsed version on success, negative code on error + */ +int utf8_parse_version(char *version) +{ + substring_t args[3]; + unsigned int maj, min, rev; + static const struct match_token token[] = { + {1, "%d.%d.%d"}, + {0, NULL} + }; + + if (match_token(version, token, args) != 1) + return -EINVAL; + + if (match_int(&args[0], &maj) || match_int(&args[1], &min) || + match_int(&args[2], &rev)) + return -EINVAL; + + return UNICODE_AGE(maj, 
min, rev); +} +EXPORT_SYMBOL(utf8_parse_version); diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c index 600e15efe9ed..5ddaf27b21a6 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/utf8-selftest.c @@ -17,9 +17,6 @@ static unsigned int failed_tests; static unsigned int total_tests; -/* Tests will be based on this version. */ -#define UTF8_LATEST UNICODE_AGE(12, 1, 0) - #define _test(cond, func, line, fmt, ...) do { \ total_tests++; \ if (!cond) { \ diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 68cdd89c97a3..7c0bd0b55f88 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -692,6 +692,34 @@ void dup_userfaultfd_complete(struct list_head *fcs) } } +void dup_userfaultfd_fail(struct list_head *fcs) +{ + struct userfaultfd_fork_ctx *fctx, *n; + + /* + * An error has occurred on fork, we will tear memory down, but have + * allocated memory for fctx's and raised reference counts for both the + * original and child contexts (and on the mm for each as a result). + * + * These would ordinarily be taken care of by a user handling the event, + * but we are no longer doing so, so manually clean up here. + * + * mm tear down will take care of cleaning up VMA contexts. 
+ */ + list_for_each_entry_safe(fctx, n, fcs, list) { + struct userfaultfd_ctx *octx = fctx->orig; + struct userfaultfd_ctx *ctx = fctx->new; + + atomic_dec(&octx->mmap_changing); + VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0); + userfaultfd_ctx_put(octx); + userfaultfd_ctx_put(ctx); + + list_del(&fctx->list); + kfree(fctx); + } +} + void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct vm_userfaultfd_ctx *vm_ctx) { diff --git a/fs/utimes.c b/fs/utimes.c index 99b26f792b89..c7c7958e57b2 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -108,18 +108,13 @@ retry: static int do_utimes_fd(int fd, struct timespec64 *times, int flags) { - struct fd f; - int error; - if (flags) return -EINVAL; - f = fdget(fd); - if (!fd_file(f)) + CLASS(fd, f)(fd); + if (fd_empty(f)) return -EBADF; - error = vfs_utimes(&fd_file(f)->f_path, times); - fdput(f); - return error; + return vfs_utimes(&fd_file(f)->f_path, times); } /* diff --git a/fs/xattr.c b/fs/xattr.c index 05ec7e7d9e87..02bee149ad96 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -586,25 +586,32 @@ retry_deleg: } EXPORT_SYMBOL_GPL(vfs_removexattr); +int import_xattr_name(struct xattr_name *kname, const char __user *name) +{ + int error = strncpy_from_user(kname->name, name, + sizeof(kname->name)); + if (error == 0 || error == sizeof(kname->name)) + return -ERANGE; + if (error < 0) + return error; + return 0; +} + /* * Extended attribute SET operations */ -int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) +int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx) { int error; if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE)) return -EINVAL; - error = strncpy_from_user(ctx->kname->name, name, - sizeof(ctx->kname->name)); - if (error == 0 || error == sizeof(ctx->kname->name)) - return -ERANGE; - if (error < 0) + error = import_xattr_name(ctx->kname, name); + if (error) return error; - error = 0; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) return -E2BIG; @@ -619,8 +626,8 @@ int 
setxattr_copy(const char __user *name, struct xattr_ctx *ctx) return error; } -int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct xattr_ctx *ctx) +static int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct kernel_xattr_ctx *ctx) { if (is_posix_acl_xattr(ctx->kname->name)) return do_set_acl(idmap, dentry, ctx->kname->name, @@ -630,32 +637,32 @@ int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, ctx->kvalue, ctx->size, ctx->flags); } -static int path_setxattr(const char __user *pathname, - const char __user *name, const void __user *value, - size_t size, int flags, unsigned int lookup_flags) +int file_setxattr(struct file *f, struct kernel_xattr_ctx *ctx) +{ + int error = mnt_want_write_file(f); + + if (!error) { + audit_file(f); + error = do_setxattr(file_mnt_idmap(f), f->f_path.dentry, ctx); + mnt_drop_write_file(f); + } + return error; +} + +/* unconditionally consumes filename */ +int filename_setxattr(int dfd, struct filename *filename, + unsigned int lookup_flags, struct kernel_xattr_ctx *ctx) { - struct xattr_name kname; - struct xattr_ctx ctx = { - .cvalue = value, - .kvalue = NULL, - .size = size, - .kname = &kname, - .flags = flags, - }; struct path path; int error; - error = setxattr_copy(name, &ctx); - if (error) - return error; - retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = mnt_want_write(path.mnt); if (!error) { - error = do_setxattr(mnt_idmap(path.mnt), path.dentry, &ctx); + error = do_setxattr(mnt_idmap(path.mnt), path.dentry, ctx); mnt_drop_write(path.mnt); } path_put(&path); @@ -665,80 +672,121 @@ retry: } out: + putname(filename); + return error; +} + +static int path_setxattrat(int dfd, const char __user *pathname, + unsigned int at_flags, const char __user *name, + const void __user *value, size_t size, int flags) +{ + struct xattr_name kname; + struct kernel_xattr_ctx 
ctx = { + .cvalue = value, + .kvalue = NULL, + .size = size, + .kname = &kname, + .flags = flags, + }; + struct filename *filename; + unsigned int lookup_flags = 0; + int error; + + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + if (!(at_flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags = LOOKUP_FOLLOW; + + error = setxattr_copy(name, &ctx); + if (error) + return error; + + filename = getname_maybe_null(pathname, at_flags); + if (!filename) { + CLASS(fd, f)(dfd); + if (fd_empty(f)) + error = -EBADF; + else + error = file_setxattr(fd_file(f), &ctx); + } else { + error = filename_setxattr(dfd, filename, lookup_flags, &ctx); + } kvfree(ctx.kvalue); return error; } +SYSCALL_DEFINE6(setxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, + const char __user *, name, const struct xattr_args __user *, uargs, + size_t, usize) +{ + struct xattr_args args = {}; + int error; + + BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); + + if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) + return -EINVAL; + if (usize > PAGE_SIZE) + return -E2BIG; + + error = copy_struct_from_user(&args, sizeof(args), uargs, usize); + if (error) + return error; + + return path_setxattrat(dfd, pathname, at_flags, name, + u64_to_user_ptr(args.value), args.size, + args.flags); +} + SYSCALL_DEFINE5(setxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { - return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW); + return path_setxattrat(AT_FDCWD, pathname, 0, name, value, size, flags); } SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { - return path_setxattr(pathname, name, value, size, flags, 0); + return path_setxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, + value, size, flags); } 
SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { - struct xattr_name kname; - struct xattr_ctx ctx = { - .cvalue = value, - .kvalue = NULL, - .size = size, - .kname = &kname, - .flags = flags, - }; - int error; - - CLASS(fd, f)(fd); - if (!fd_file(f)) - return -EBADF; - - audit_file(fd_file(f)); - error = setxattr_copy(name, &ctx); - if (error) - return error; - - error = mnt_want_write_file(fd_file(f)); - if (!error) { - error = do_setxattr(file_mnt_idmap(fd_file(f)), - fd_file(f)->f_path.dentry, &ctx); - mnt_drop_write_file(fd_file(f)); - } - kvfree(ctx.kvalue); - return error; + return path_setxattrat(fd, NULL, AT_EMPTY_PATH, name, + value, size, flags); } /* * Extended attribute GET operations */ -ssize_t +static ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, - struct xattr_ctx *ctx) + struct kernel_xattr_ctx *ctx) { ssize_t error; char *kname = ctx->kname->name; + void *kvalue = NULL; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) ctx->size = XATTR_SIZE_MAX; - ctx->kvalue = kvzalloc(ctx->size, GFP_KERNEL); - if (!ctx->kvalue) + kvalue = kvzalloc(ctx->size, GFP_KERNEL); + if (!kvalue) return -ENOMEM; } - if (is_posix_acl_xattr(ctx->kname->name)) - error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size); + if (is_posix_acl_xattr(kname)) + error = do_get_acl(idmap, d, kname, kvalue, ctx->size); else - error = vfs_getxattr(idmap, d, kname, ctx->kvalue, ctx->size); + error = vfs_getxattr(idmap, d, kname, kvalue, ctx->size); if (error > 0) { - if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) + if (ctx->size && copy_to_user(ctx->value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { /* The file system tried to returned a value bigger @@ -746,79 +794,114 @@ do_getxattr(struct mnt_idmap *idmap, struct dentry *d, error = -E2BIG; } + kvfree(kvalue); return error; } -static ssize_t -getxattr(struct mnt_idmap *idmap, struct 
dentry *d, - const char __user *name, void __user *value, size_t size) +ssize_t file_getxattr(struct file *f, struct kernel_xattr_ctx *ctx) { + audit_file(f); + return do_getxattr(file_mnt_idmap(f), f->f_path.dentry, ctx); +} + +/* unconditionally consumes filename */ +ssize_t filename_getxattr(int dfd, struct filename *filename, + unsigned int lookup_flags, struct kernel_xattr_ctx *ctx) +{ + struct path path; ssize_t error; +retry: + error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); + if (error) + goto out; + error = do_getxattr(mnt_idmap(path.mnt), path.dentry, ctx); + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +out: + putname(filename); + return error; +} + +static ssize_t path_getxattrat(int dfd, const char __user *pathname, + unsigned int at_flags, const char __user *name, + void __user *value, size_t size) +{ struct xattr_name kname; - struct xattr_ctx ctx = { + struct kernel_xattr_ctx ctx = { .value = value, - .kvalue = NULL, .size = size, .kname = &kname, .flags = 0, }; + struct filename *filename; + ssize_t error; - error = strncpy_from_user(kname.name, name, sizeof(kname.name)); - if (error == 0 || error == sizeof(kname.name)) - error = -ERANGE; - if (error < 0) - return error; + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; - error = do_getxattr(idmap, d, &ctx); + error = import_xattr_name(&kname, name); + if (error) + return error; - kvfree(ctx.kvalue); - return error; + filename = getname_maybe_null(pathname, at_flags); + if (!filename) { + CLASS(fd, f)(dfd); + if (fd_empty(f)) + return -EBADF; + return file_getxattr(fd_file(f), &ctx); + } else { + int lookup_flags = 0; + if (!(at_flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags = LOOKUP_FOLLOW; + return filename_getxattr(dfd, filename, lookup_flags, &ctx); + } } -static ssize_t path_getxattr(const char __user *pathname, - const char __user *name, void __user *value, - size_t size, unsigned 
int lookup_flags) +SYSCALL_DEFINE6(getxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, + const char __user *, name, struct xattr_args __user *, uargs, size_t, usize) { - struct path path; - ssize_t error; -retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + struct xattr_args args = {}; + int error; + + BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); + + if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) + return -EINVAL; + if (usize > PAGE_SIZE) + return -E2BIG; + + error = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (error) return error; - error = getxattr(mnt_idmap(path.mnt), path.dentry, name, value, size); - path_put(&path); - if (retry_estale(error, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - return error; + + if (args.flags != 0) + return -EINVAL; + + return path_getxattrat(dfd, pathname, at_flags, name, + u64_to_user_ptr(args.value), args.size); } SYSCALL_DEFINE4(getxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { - return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW); + return path_getxattrat(AT_FDCWD, pathname, 0, name, value, size); } SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { - return path_getxattr(pathname, name, value, size, 0); + return path_getxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, + value, size); } SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size) { - struct fd f = fdget(fd); - ssize_t error = -EBADF; - - if (!fd_file(f)) - return error; - audit_file(fd_file(f)); - error = getxattr(file_mnt_idmap(fd_file(f)), fd_file(f)->f_path.dentry, - name, value, size); - fdput(f); - return error; + return path_getxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size); } /* @@ -853,47 +936,80 
@@ listxattr(struct dentry *d, char __user *list, size_t size) return error; } -static ssize_t path_listxattr(const char __user *pathname, char __user *list, - size_t size, unsigned int lookup_flags) +static +ssize_t file_listxattr(struct file *f, char __user *list, size_t size) +{ + audit_file(f); + return listxattr(f->f_path.dentry, list, size); +} + +/* unconditionally consumes filename */ +static +ssize_t filename_listxattr(int dfd, struct filename *filename, + unsigned int lookup_flags, + char __user *list, size_t size) { struct path path; ssize_t error; retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) - return error; + goto out; error = listxattr(path.dentry, list, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } +out: + putname(filename); return error; } +static ssize_t path_listxattrat(int dfd, const char __user *pathname, + unsigned int at_flags, char __user *list, + size_t size) +{ + struct filename *filename; + int lookup_flags; + + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + filename = getname_maybe_null(pathname, at_flags); + if (!filename) { + CLASS(fd, f)(dfd); + if (fd_empty(f)) + return -EBADF; + return file_listxattr(fd_file(f), list, size); + } + + lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 
0 : LOOKUP_FOLLOW; + return filename_listxattr(dfd, filename, lookup_flags, list, size); +} + +SYSCALL_DEFINE5(listxattrat, int, dfd, const char __user *, pathname, + unsigned int, at_flags, + char __user *, list, size_t, size) +{ + return path_listxattrat(dfd, pathname, at_flags, list, size); +} + SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, size_t, size) { - return path_listxattr(pathname, list, size, LOOKUP_FOLLOW); + return path_listxattrat(AT_FDCWD, pathname, 0, list, size); } SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, size_t, size) { - return path_listxattr(pathname, list, size, 0); + return path_listxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, list, size); } SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) { - struct fd f = fdget(fd); - ssize_t error = -EBADF; - - if (!fd_file(f)) - return error; - audit_file(fd_file(f)); - error = listxattr(fd_file(f)->f_path.dentry, list, size); - fdput(f); - return error; + return path_listxattrat(fd, NULL, AT_EMPTY_PATH, list, size); } /* @@ -907,25 +1023,33 @@ removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name) return vfs_removexattr(idmap, d, name); } -static int path_removexattr(const char __user *pathname, - const char __user *name, unsigned int lookup_flags) +static int file_removexattr(struct file *f, struct xattr_name *kname) +{ + int error = mnt_want_write_file(f); + + if (!error) { + audit_file(f); + error = removexattr(file_mnt_idmap(f), + f->f_path.dentry, kname->name); + mnt_drop_write_file(f); + } + return error; +} + +/* unconditionally consumes filename */ +static int filename_removexattr(int dfd, struct filename *filename, + unsigned int lookup_flags, struct xattr_name *kname) { struct path path; int error; - char kname[XATTR_NAME_MAX + 1]; - error = strncpy_from_user(kname, name, sizeof(kname)); - if (error == 0 || error == sizeof(kname)) - error = -ERANGE; - if (error < 0) - return 
error; retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) - return error; + goto out; error = mnt_want_write(path.mnt); if (!error) { - error = removexattr(mnt_idmap(path.mnt), path.dentry, kname); + error = removexattr(mnt_idmap(path.mnt), path.dentry, kname->name); mnt_drop_write(path.mnt); } path_put(&path); @@ -933,45 +1057,58 @@ retry: lookup_flags |= LOOKUP_REVAL; goto retry; } +out: + putname(filename); return error; } +static int path_removexattrat(int dfd, const char __user *pathname, + unsigned int at_flags, const char __user *name) +{ + struct xattr_name kname; + struct filename *filename; + unsigned int lookup_flags; + int error; + + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + error = import_xattr_name(&kname, name); + if (error) + return error; + + filename = getname_maybe_null(pathname, at_flags); + if (!filename) { + CLASS(fd, f)(dfd); + if (fd_empty(f)) + return -EBADF; + return file_removexattr(fd_file(f), &kname); + } + lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 
0 : LOOKUP_FOLLOW; + return filename_removexattr(dfd, filename, lookup_flags, &kname); +} + +SYSCALL_DEFINE4(removexattrat, int, dfd, const char __user *, pathname, + unsigned int, at_flags, const char __user *, name) +{ + return path_removexattrat(dfd, pathname, at_flags, name); +} + SYSCALL_DEFINE2(removexattr, const char __user *, pathname, const char __user *, name) { - return path_removexattr(pathname, name, LOOKUP_FOLLOW); + return path_removexattrat(AT_FDCWD, pathname, 0, name); } SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, const char __user *, name) { - return path_removexattr(pathname, name, 0); + return path_removexattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name); } SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { - struct fd f = fdget(fd); - char kname[XATTR_NAME_MAX + 1]; - int error = -EBADF; - - if (!fd_file(f)) - return error; - audit_file(fd_file(f)); - - error = strncpy_from_user(kname, name, sizeof(kname)); - if (error == 0 || error == sizeof(kname)) - error = -ERANGE; - if (error < 0) - return error; - - error = mnt_want_write_file(fd_file(f)); - if (!error) { - error = removexattr(file_mnt_idmap(fd_file(f)), - fd_file(f)->f_path.dentry, kname); - mnt_drop_write_file(fd_file(f)); - } - fdput(f); - return error; + return path_removexattrat(fd, NULL, AT_EMPTY_PATH, name); } int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) @@ -1005,9 +1142,10 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr; ssize_t remaining_size = buffer_size; - int err = 0; for_each_xattr_handler(handlers, handler) { + int err; + if (!handler->name || (handler->list && !handler->list(dentry))) continue; err = xattr_list_one(&buffer, &remaining_size, handler->name); @@ -1015,7 +1153,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) return err; } - return err ? 
err : buffer_size - remaining_size; + return buffer_size - remaining_size; } EXPORT_SYMBOL(generic_listxattr); diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 5f0494702e0b..5ca8d0106827 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -185,17 +185,20 @@ out: } /* - * Free up the per-ag resources associated with the mount structure. + * Free up the per-ag resources within the specified AG range. */ void -xfs_free_perag( - struct xfs_mount *mp) +xfs_free_perag_range( + struct xfs_mount *mp, + xfs_agnumber_t first_agno, + xfs_agnumber_t end_agno) + { - struct xfs_perag *pag; xfs_agnumber_t agno; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xa_erase(&mp->m_perags, agno); + for (agno = first_agno; agno < end_agno; agno++) { + struct xfs_perag *pag = xa_erase(&mp->m_perags, agno); + ASSERT(pag); XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); xfs_defer_drain_free(&pag->pag_intents_drain); @@ -270,54 +273,37 @@ xfs_agino_range( return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last); } -/* - * Free perag within the specified AG range, it is only used to free unused - * perags under the error handling path. 
- */ -void -xfs_free_unused_perag_range( +int +xfs_update_last_ag_size( struct xfs_mount *mp, - xfs_agnumber_t agstart, - xfs_agnumber_t agend) + xfs_agnumber_t prev_agcount) { - struct xfs_perag *pag; - xfs_agnumber_t index; + struct xfs_perag *pag = xfs_perag_grab(mp, prev_agcount - 1); - for (index = agstart; index < agend; index++) { - pag = xa_erase(&mp->m_perags, index); - if (!pag) - break; - xfs_buf_cache_destroy(&pag->pag_bcache); - xfs_defer_drain_free(&pag->pag_intents_drain); - kfree(pag); - } + if (!pag) + return -EFSCORRUPTED; + pag->block_count = __xfs_ag_block_count(mp, prev_agcount - 1, + mp->m_sb.sb_agcount, mp->m_sb.sb_dblocks); + __xfs_agino_range(mp, pag->block_count, &pag->agino_min, + &pag->agino_max); + xfs_perag_rele(pag); + return 0; } int xfs_initialize_perag( struct xfs_mount *mp, - xfs_agnumber_t agcount, + xfs_agnumber_t old_agcount, + xfs_agnumber_t new_agcount, xfs_rfsblock_t dblocks, xfs_agnumber_t *maxagi) { struct xfs_perag *pag; xfs_agnumber_t index; - xfs_agnumber_t first_initialised = NULLAGNUMBER; int error; - /* - * Walk the current per-ag tree so we don't try to initialise AGs - * that already exist (growfs case). Allocate and insert all the - * AGs we don't find ready for initialisation. - */ - for (index = 0; index < agcount; index++) { - pag = xfs_perag_get(mp, index); - if (pag) { - xfs_perag_put(pag); - continue; - } - - pag = kzalloc(sizeof(*pag), GFP_KERNEL | __GFP_RETRY_MAYFAIL); + for (index = old_agcount; index < new_agcount; index++) { + pag = kzalloc(sizeof(*pag), GFP_KERNEL); if (!pag) { error = -ENOMEM; goto out_unwind_new_pags; @@ -353,21 +339,17 @@ xfs_initialize_perag( /* Active ref owned by mount indicates AG is online. 
*/ atomic_set(&pag->pag_active_ref, 1); - /* first new pag is fully initialized */ - if (first_initialised == NULLAGNUMBER) - first_initialised = index; - /* * Pre-calculated geometry */ - pag->block_count = __xfs_ag_block_count(mp, index, agcount, + pag->block_count = __xfs_ag_block_count(mp, index, new_agcount, dblocks); pag->min_block = XFS_AGFL_BLOCK(mp); __xfs_agino_range(mp, pag->block_count, &pag->agino_min, &pag->agino_max); } - index = xfs_set_inode_alloc(mp, agcount); + index = xfs_set_inode_alloc(mp, new_agcount); if (maxagi) *maxagi = index; @@ -381,8 +363,7 @@ out_remove_pag: out_free_pag: kfree(pag); out_unwind_new_pags: - /* unwind any prior newly initialized pags */ - xfs_free_unused_perag_range(mp, first_initialised, agcount); + xfs_free_perag_range(mp, old_agcount, index); return error; } diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index d9cccd093b60..9edfe0e96439 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -144,12 +144,13 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA) __XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES) __XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET) -void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart, - xfs_agnumber_t agend); -int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, - xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); +int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t old_agcount, + xfs_agnumber_t agcount, xfs_rfsblock_t dcount, + xfs_agnumber_t *maxagi); +void xfs_free_perag_range(struct xfs_mount *mp, xfs_agnumber_t first_agno, + xfs_agnumber_t end_agno); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); -void xfs_free_perag(struct xfs_mount *mp); +int xfs_update_last_ag_size(struct xfs_mount *mp, xfs_agnumber_t prev_agcount); /* Passive AG references */ struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); diff --git a/fs/xfs/libxfs/xfs_alloc.c 
b/fs/xfs/libxfs/xfs_alloc.c index 04f64cf9777e..22bdbb3e9980 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1923,7 +1923,7 @@ restart: error = -EFSCORRUPTED; goto error0; } - if (flen < bestrlen) + if (flen <= bestrlen) break; busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen, &busy_gen); diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 3c40f37e82c7..c962ad64b0c1 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -62,12 +62,12 @@ xfs_trans_ichgtime( ASSERT(tp); xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - tv = current_time(inode); + /* If the mtime changes, then ctime must also change */ + ASSERT(flags & XFS_ICHGTIME_CHG); + tv = inode_set_ctime_current(inode); if (flags & XFS_ICHGTIME_MOD) inode_set_mtime_to_ts(inode, tv); - if (flags & XFS_ICHGTIME_CHG) - inode_set_ctime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_ACCESS) inode_set_atime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c index 49dc38acc66b..4505f4829d53 100644 --- a/fs/xfs/scrub/bmap_repair.c +++ b/fs/xfs/scrub/bmap_repair.c @@ -801,7 +801,7 @@ xrep_bmap( { struct xrep_bmap *rb; char *descr; - unsigned int max_bmbt_recs; + xfs_extnum_t max_bmbt_recs; bool large_extcount; int error = 0; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 67478294f11a..155bbaaa496e 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -1084,9 +1084,11 @@ xrep_metadata_inode_forks( return error; /* Make sure the attr fork looks ok before we delete it. */ - error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); - if (error) - return error; + if (xfs_inode_hasattr(sc->ip)) { + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); + if (error) + return error; + } /* Clear the reflink flag since metadata never shares. 
*/ if (xfs_is_reflink_inode(sc->ip)) { diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6dead20338e2..559a3a577097 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -116,7 +116,7 @@ xfs_end_ioend( if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); - xfs_bmap_punch_delalloc_range(ip, offset, + xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, offset + size); } goto done; @@ -456,7 +456,7 @@ xfs_discard_folio( * byte of the next folio. Hence the end offset is only dependent on the * folio itself and not the start offset that is passed in. */ - xfs_bmap_punch_delalloc_range(ip, pos, + xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, folio_pos(folio) + folio_size(folio)); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 053d567c9108..4719ec90029c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -442,11 +442,12 @@ out_unlock_iolock: void xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, + int whichfork, xfs_off_t start_byte, xfs_off_t end_byte) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = &ip->i_df; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); struct xfs_bmbt_irec got, del; @@ -474,11 +475,14 @@ xfs_bmap_punch_delalloc_range( continue; } - xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, &got, &del); + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } + if (whichfork == XFS_COW_FORK && !ifp->if_bytes) + xfs_inode_clear_cowblocks_tag(ip); + out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); } @@ -580,7 +584,7 @@ xfs_free_eofblocks( */ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) { if (ip->i_delayed_blks) { - xfs_bmap_punch_delalloc_range(ip, + xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, round_up(XFS_ISIZE(ip), 
mp->m_sb.sb_blocksize), LLONG_MAX); } diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index eb0895bfb9da..b29760d36e1a 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -30,7 +30,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) } #endif /* CONFIG_XFS_RT */ -void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, +void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork, xfs_off_t start_byte, xfs_off_t end_byte); struct kgetbmap { diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index aa4dbda7b536..e8196f5778e2 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2115,6 +2115,13 @@ xfs_alloc_buftarg( btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); + if (bdev_can_atomic_write(btp->bt_bdev)) { + btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes( + btp->bt_bdev); + btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes( + btp->bt_bdev); + } + /* * When allocating the buftargs we have not yet read the super block and * thus don't know the file system sector size yet. 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 209a389f2abc..3d56bc7a35cc 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -124,6 +124,10 @@ struct xfs_buftarg { struct percpu_counter bt_io_count; struct ratelimit_state bt_ioerror_rl; + /* Atomic write unit values */ + unsigned int bt_bdev_awu_min; + unsigned int bt_bdev_awu_max; + /* built-in cache, if we're not using the perag one */ struct xfs_buf_cache bt_cache[]; }; diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 09e893cf563c..5180cbf5a90b 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -22,6 +22,9 @@ #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_quota.h" +#include "xfs_alloc.h" +#include "xfs_ag.h" +#include "xfs_sb.h" /* * This is the number of entries in the l_buf_cancel_table used during @@ -685,6 +688,67 @@ xlog_recover_do_inode_buffer( } /* + * Update the in-memory superblock and perag structures from the primary SB + * buffer. + * + * This is required because transactions running after growfs may require the + * updated values to be set in a previous fully commit transaction. + */ +static int +xlog_recover_do_primary_sb_buffer( + struct xfs_mount *mp, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f, + xfs_lsn_t current_lsn) +{ + struct xfs_dsb *dsb = bp->b_addr; + xfs_agnumber_t orig_agcount = mp->m_sb.sb_agcount; + int error; + + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); + + if (orig_agcount == 0) { + xfs_alert(mp, "Trying to grow file system without AGs"); + return -EFSCORRUPTED; + } + + /* + * Update the in-core super block from the freshly recovered on-disk one. + */ + xfs_sb_from_disk(&mp->m_sb, dsb); + + if (mp->m_sb.sb_agcount < orig_agcount) { + xfs_alert(mp, "Shrinking AG count in log recovery not supported"); + return -EFSCORRUPTED; + } + + /* + * Growfs can also grow the last existing AG. 
In this case we also need + * to update the length in the in-core perag structure and values + * depending on it. + */ + error = xfs_update_last_ag_size(mp, orig_agcount); + if (error) + return error; + + /* + * Initialize the new perags, and also update various block and inode + * allocator setting based off the number of AGs or total blocks. + * Because of the latter this also needs to happen if the agcount did + * not change. + */ + error = xfs_initialize_perag(mp, orig_agcount, mp->m_sb.sb_agcount, + mp->m_sb.sb_dblocks, &mp->m_maxagi); + if (error) { + xfs_warn(mp, "Failed recovery per-ag init: %d", error); + return error; + } + mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); + return 0; +} + +/* * V5 filesystems know the age of the buffer on disk being recovered. We can * have newer objects on disk than we are replaying, and so for these cases we * don't want to replay the current change as that will make the buffer contents @@ -967,6 +1031,12 @@ xlog_recover_buf_commit_pass2( dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); if (!dirty) goto out_release; + } else if ((xfs_blft_from_flags(buf_f) & XFS_BLFT_SB_BUF) && + xfs_buf_daddr(bp) == 0) { + error = xlog_recover_do_primary_sb_buffer(mp, item, bp, buf_f, + current_lsn); + if (error) + goto out_release; } else { xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); } diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c index 75cb53f090d1..fa29c8b334d2 100644 --- a/fs/xfs/xfs_exchrange.c +++ b/fs/xfs/xfs_exchrange.c @@ -813,8 +813,6 @@ xfs_ioc_exchange_range( .file2 = file, }; struct xfs_exchange_range args; - struct fd file1; - int error; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; @@ -828,14 +826,12 @@ xfs_ioc_exchange_range( fxr.length = args.length; fxr.flags = args.flags; - file1 = fdget(args.file1_fd); - if (!fd_file(file1)) + CLASS(fd, file1)(args.file1_fd); + if (fd_empty(file1)) return -EBADF; fxr.file1 = fd_file(file1); - error = 
xfs_exchange_range(&fxr); - fdput(file1); - return error; + return xfs_exchange_range(&fxr); } /* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */ @@ -909,8 +905,6 @@ xfs_ioc_commit_range( struct xfs_commit_range_fresh *kern_f; struct xfs_inode *ip2 = XFS_I(file_inode(file)); struct xfs_mount *mp = ip2->i_mount; - struct fd file1; - int error; kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; @@ -934,12 +928,10 @@ xfs_ioc_commit_range( fxr.file2_ctime.tv_sec = kern_f->file2_ctime; fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec; - file1 = fdget(args.file1_fd); + CLASS(fd, file1)(args.file1_fd); if (fd_empty(file1)) return -EBADF; fxr.file1 = fd_file(file1); - error = xfs_exchange_range(&fxr); - fdput(file1); - return error; + return xfs_exchange_range(&fxr); } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 412b1d71b52b..ca47cae5a40a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -348,9 +348,82 @@ xfs_file_splice_read( } /* + * Take care of zeroing post-EOF blocks when they might exist. + * + * Returns 0 if successfully, a negative error for a failure, or 1 if this + * function dropped the iolock and reacquired it exclusively and the caller + * needs to restart the write sanity checks. + */ +static ssize_t +xfs_file_write_zero_eof( + struct kiocb *iocb, + struct iov_iter *from, + unsigned int *iolock, + size_t count, + bool *drained_dio) +{ + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + loff_t isize; + int error; + + /* + * We need to serialise against EOF updates that occur in IO completions + * here. We want to make sure that nobody is changing the size while + * we do this check until we have placed an IO barrier (i.e. hold + * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The + * spinlock effectively forms a memory barrier once we have + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and + * hence be able to correctly determine if we need to run zeroing. 
+ */ + spin_lock(&ip->i_flags_lock); + isize = i_size_read(VFS_I(ip)); + if (iocb->ki_pos <= isize) { + spin_unlock(&ip->i_flags_lock); + return 0; + } + spin_unlock(&ip->i_flags_lock); + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + if (!*drained_dio) { + /* + * If zeroing is needed and we are currently holding the iolock + * shared, we need to update it to exclusive which implies + * having to redo all checks before. + */ + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_ilock(ip, *iolock); + iov_iter_reexpand(from, count); + } + + /* + * We now have an IO submission barrier in place, but AIO can do + * EOF updates during IO completion and hence we now need to + * wait for all of them to drain. Non-AIO DIO will have drained + * before we are given the XFS_IOLOCK_EXCL, and so for most + * cases this wait is a no-op. + */ + inode_dio_wait(VFS_I(ip)); + *drained_dio = true; + return 1; + } + + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); + + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + + return error; +} + +/* * Common pre-write limit and setup checks. * - * Called with the iolocked held either shared and exclusive according to + * Called with the iolock held either shared and exclusive according to * @iolock, and returns with it held. Might upgrade the iolock to exclusive * if called for a direct write beyond i_size. */ @@ -360,13 +433,10 @@ xfs_file_write_checks( struct iov_iter *from, unsigned int *iolock) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - ssize_t error = 0; + struct inode *inode = iocb->ki_filp->f_mapping->host; size_t count = iov_iter_count(from); bool drained_dio = false; - loff_t isize; + ssize_t error; restart: error = generic_write_checks(iocb, from); @@ -389,7 +459,7 @@ restart: * exclusively. 
*/ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { - xfs_iunlock(ip, *iolock); + xfs_iunlock(XFS_I(inode), *iolock); *iolock = XFS_IOLOCK_EXCL; error = xfs_ilock_iocb(iocb, *iolock); if (error) { @@ -400,64 +470,24 @@ restart: } /* - * If the offset is beyond the size of the file, we need to zero any + * If the offset is beyond the size of the file, we need to zero all * blocks that fall between the existing EOF and the start of this - * write. If zeroing is needed and we are currently holding the iolock - * shared, we need to update it to exclusive which implies having to - * redo all checks before. - * - * We need to serialise against EOF updates that occur in IO completions - * here. We want to make sure that nobody is changing the size while we - * do this check until we have placed an IO barrier (i.e. hold the - * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The - * spinlock effectively forms a memory barrier once we have the - * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and - * hence be able to correctly determine if we need to run zeroing. + * write. * - * We can do an unlocked check here safely as IO completion can only - * extend EOF. Truncate is locked out at this point, so the EOF can - * not move backwards, only forwards. Hence we only need to take the - * slow path and spin locks when we are at or beyond the current EOF. + * We can do an unlocked check for i_size here safely as I/O completion + * can only extend EOF. Truncate is locked out at this point, so the + * EOF can not move backwards, only forwards. Hence we only need to take + * the slow path when we are at or beyond the current EOF. 
*/ - if (iocb->ki_pos <= i_size_read(inode)) - goto out; - - spin_lock(&ip->i_flags_lock); - isize = i_size_read(inode); - if (iocb->ki_pos > isize) { - spin_unlock(&ip->i_flags_lock); - - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; - - if (!drained_dio) { - if (*iolock == XFS_IOLOCK_SHARED) { - xfs_iunlock(ip, *iolock); - *iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, *iolock); - iov_iter_reexpand(from, count); - } - /* - * We now have an IO submission barrier in place, but - * AIO can do EOF updates during IO completion and hence - * we now need to wait for all of them to drain. Non-AIO - * DIO will have drained before we are given the - * XFS_IOLOCK_EXCL, and so for most cases this wait is a - * no-op. - */ - inode_dio_wait(inode); - drained_dio = true; + if (iocb->ki_pos > i_size_read(inode)) { + error = xfs_file_write_zero_eof(iocb, from, iolock, count, + &drained_dio); + if (error == 1) goto restart; - } - - trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); - error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); if (error) return error; - } else - spin_unlock(&ip->i_flags_lock); + } -out: return kiocb_modified(iocb); } @@ -822,6 +852,20 @@ xfs_file_write_iter( if (IS_DAX(inode)) return xfs_file_dax_write(iocb, from); + if (iocb->ki_flags & IOCB_ATOMIC) { + /* + * Currently only atomic writing of a single FS block is + * supported. It would be possible to atomic write smaller than + * a FS block, but there is no requirement to support this. + * Note that iomap also does not support this yet. 
+ */ + if (ocount != ip->i_mount->m_sb.sb_blocksize) + return -EINVAL; + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered @@ -1209,6 +1253,8 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; + if (xfs_inode_can_atomicwrite(XFS_I(inode))) + file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return generic_file_open(inode, file); } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index e3aaa0555597..290ba8887d29 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -64,25 +64,31 @@ xfs_filestream_pick_ag( struct xfs_perag *pag; struct xfs_perag *max_pag = NULL; xfs_extlen_t minlen = *longest; - xfs_extlen_t free = 0, minfree, maxfree = 0; + xfs_extlen_t minfree, maxfree = 0; xfs_agnumber_t agno; bool first_pass = true; - int err; /* 2% of an AG's blocks must be free for it to be chosen. */ minfree = mp->m_sb.sb_agblocks / 50; restart: for_each_perag_wrap(mp, start_agno, agno, pag) { + int err; + trace_xfs_filestream_scan(pag, pino); + *longest = 0; err = xfs_bmap_longest_free_extent(pag, NULL, longest); if (err) { - if (err != -EAGAIN) - break; - /* Couldn't lock the AGF, skip this AG. */ - err = 0; - continue; + if (err == -EAGAIN) { + /* Couldn't lock the AGF, skip this AG. */ + err = 0; + continue; + } + xfs_perag_rele(pag); + if (max_pag) + xfs_perag_rele(max_pag); + return err; } /* Keep track of the AG with the most free blocks. */ @@ -107,8 +113,9 @@ restart: !(flags & XFS_PICK_USERDATA) || (flags & XFS_PICK_LOWSPACE))) { /* Break out, retaining the reference on the AG. 
*/ - free = pag->pagf_freeblks; - break; + if (max_pag) + xfs_perag_rele(max_pag); + goto done; } } @@ -116,57 +123,47 @@ restart: atomic_dec(&pag->pagf_fstrms); } - if (err) { - xfs_perag_rele(pag); - if (max_pag) - xfs_perag_rele(max_pag); - return err; + /* + * Allow a second pass to give xfs_bmap_longest_free_extent() another + * attempt at locking AGFs that it might have skipped over before we + * fail. + */ + if (first_pass) { + first_pass = false; + goto restart; } - if (!pag) { - /* - * Allow a second pass to give xfs_bmap_longest_free_extent() - * another attempt at locking AGFs that it might have skipped - * over before we fail. - */ - if (first_pass) { - first_pass = false; - goto restart; - } + /* + * We must be low on data space, so run a final lowspace optimised + * selection pass if we haven't already. + */ + if (!(flags & XFS_PICK_LOWSPACE)) { + flags |= XFS_PICK_LOWSPACE; + goto restart; + } - /* - * We must be low on data space, so run a final lowspace - * optimised selection pass if we haven't already. - */ - if (!(flags & XFS_PICK_LOWSPACE)) { - flags |= XFS_PICK_LOWSPACE; - goto restart; + /* + * No unassociated AGs are available, so select the AG with the most + * free space, regardless of whether it's already in use by another + * filestream. It none suit, just use whatever AG we can grab. + */ + if (!max_pag) { + for_each_perag_wrap(args->mp, 0, start_agno, pag) { + max_pag = pag; + break; } - /* - * No unassociated AGs are available, so select the AG with the - * most free space, regardless of whether it's already in use by - * another filestream. It none suit, just use whatever AG we can - * grab. - */ - if (!max_pag) { - for_each_perag_wrap(args->mp, 0, start_agno, args->pag) - break; - atomic_inc(&args->pag->pagf_fstrms); - *longest = 0; - } else { - pag = max_pag; - free = maxfree; - atomic_inc(&pag->pagf_fstrms); - } - } else if (max_pag) { - xfs_perag_rele(max_pag); + /* Bail if there are no AGs at all to select from. 
*/ + if (!max_pag) + return -ENOSPC; } - trace_xfs_filestream_pick(pag, pino, free); + pag = max_pag; + atomic_inc(&pag->pagf_fstrms); +done: + trace_xfs_filestream_pick(pag, pino); args->pag = pag; return 0; - } static struct xfs_inode * diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 3643cc843f62..b247d895c276 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -87,6 +87,7 @@ xfs_growfs_data_private( struct xfs_mount *mp, /* mount point for filesystem */ struct xfs_growfs_data *in) /* growfs data input struct */ { + xfs_agnumber_t oagcount = mp->m_sb.sb_agcount; struct xfs_buf *bp; int error; xfs_agnumber_t nagcount; @@ -94,7 +95,6 @@ xfs_growfs_data_private( xfs_rfsblock_t nb, nb_div, nb_mod; int64_t delta; bool lastag_extended = false; - xfs_agnumber_t oagcount; struct xfs_trans *tp; struct aghdr_init_data id = {}; struct xfs_perag *last_pag; @@ -138,16 +138,14 @@ xfs_growfs_data_private( if (delta == 0) return 0; - oagcount = mp->m_sb.sb_agcount; - /* allocate the new per-ag structures */ - if (nagcount > oagcount) { - error = xfs_initialize_perag(mp, nagcount, nb, &nagimax); - if (error) - return error; - } else if (nagcount < oagcount) { - /* TODO: shrinking the entire AGs hasn't yet completed */ + /* TODO: shrinking the entire AGs hasn't yet completed */ + if (nagcount < oagcount) return -EINVAL; - } + + /* allocate the new per-ag structures */ + error = xfs_initialize_perag(mp, oagcount, nagcount, nb, &nagimax); + if (error) + return error; if (delta > 0) error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, @@ -231,7 +229,7 @@ out_trans_cancel: xfs_trans_cancel(tp); out_free_unused_perag: if (nagcount > oagcount) - xfs_free_unused_perag_range(mp, oagcount, nagcount); + xfs_free_perag_range(mp, oagcount, nagcount); return error; } diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c index 49e5e5f04e60..f19fce557354 100644 --- a/fs/xfs/xfs_handle.c +++ b/fs/xfs/xfs_handle.c @@ -85,22 +85,23 @@ xfs_find_handle( int hsize; xfs_handle_t 
handle; struct inode *inode; - struct fd f = EMPTY_FD; struct path path; int error; struct xfs_inode *ip; if (cmd == XFS_IOC_FD_TO_HANDLE) { - f = fdget(hreq->fd); - if (!fd_file(f)) + CLASS(fd, f)(hreq->fd); + + if (fd_empty(f)) return -EBADF; - inode = file_inode(fd_file(f)); + path = fd_file(f)->f_path; + path_get(&path); } else { error = user_path_at(AT_FDCWD, hreq->path, 0, &path); if (error) return error; - inode = d_inode(path.dentry); } + inode = d_inode(path.dentry); ip = XFS_I(inode); /* @@ -134,10 +135,7 @@ xfs_find_handle( error = 0; out_put: - if (cmd == XFS_IOC_FD_TO_HANDLE) - fdput(f); - else - path_put(&path); + path_put(&path); return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bcc277fc0a83..19dcb569a3e7 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1409,7 +1409,7 @@ xfs_inactive( if (S_ISREG(VFS_I(ip)->i_mode) && (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 || - ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) + xfs_inode_has_filedata(ip))) truncate = 1; if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 97ed912306fd..a2a6b5fd2545 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -292,6 +292,11 @@ static inline bool xfs_is_cow_inode(struct xfs_inode *ip) return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); } +static inline bool xfs_inode_has_filedata(const struct xfs_inode *ip) +{ + return ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0; +} + /* * Check if an inode has any data in the COW fork. This might be often false * even for inodes with the reflink flag when there is no pending COW operation. @@ -327,6 +332,21 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) (XFS_IS_REALTIME_INODE(ip) ? 
\ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp) +static inline bool +xfs_inode_can_atomicwrite( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min) + return false; + if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max) + return false; + + return true; +} + /* * In-core inode flags. */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index a20d426ef021..af1bb5db1c59 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -481,7 +481,7 @@ xfs_ioctl_setattr_xflags( if (rtflag != XFS_IS_REALTIME_INODE(ip)) { /* Can't change realtime flag if any extents are allocated. */ - if (ip->i_df.if_nextents || ip->i_delayed_blks) + if (xfs_inode_has_filedata(ip)) return -EINVAL; /* @@ -602,7 +602,7 @@ xfs_ioctl_setattr_check_extsize( if (!fa->fsx_valid) return 0; - if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents && + if (S_ISREG(VFS_I(ip)->i_mode) && xfs_inode_has_filedata(ip) && XFS_FSB_TO_B(mp, ip->i_extsize) != fa->fsx_extsize) return -EINVAL; @@ -881,41 +881,29 @@ xfs_ioc_swapext( xfs_swapext_t *sxp) { xfs_inode_t *ip, *tip; - struct fd f, tmp; - int error = 0; /* Pull information for the target fd */ - f = fdget((int)sxp->sx_fdtarget); - if (!fd_file(f)) { - error = -EINVAL; - goto out; - } + CLASS(fd, f)((int)sxp->sx_fdtarget); + if (fd_empty(f)) + return -EINVAL; if (!(fd_file(f)->f_mode & FMODE_WRITE) || !(fd_file(f)->f_mode & FMODE_READ) || - (fd_file(f)->f_flags & O_APPEND)) { - error = -EBADF; - goto out_put_file; - } + (fd_file(f)->f_flags & O_APPEND)) + return -EBADF; - tmp = fdget((int)sxp->sx_fdtmp); - if (!fd_file(tmp)) { - error = -EINVAL; - goto out_put_file; - } + CLASS(fd, tmp)((int)sxp->sx_fdtmp); + if (fd_empty(tmp)) + return -EINVAL; if (!(fd_file(tmp)->f_mode & FMODE_WRITE) || !(fd_file(tmp)->f_mode & FMODE_READ) || - (fd_file(tmp)->f_flags & O_APPEND)) { - error = -EBADF; - goto out_put_tmp_file; - } + 
(fd_file(tmp)->f_flags & O_APPEND)) + return -EBADF; if (IS_SWAPFILE(file_inode(fd_file(f))) || - IS_SWAPFILE(file_inode(fd_file(tmp)))) { - error = -EINVAL; - goto out_put_tmp_file; - } + IS_SWAPFILE(file_inode(fd_file(tmp)))) + return -EINVAL; /* * We need to ensure that the fds passed in point to XFS inodes @@ -923,37 +911,22 @@ xfs_ioc_swapext( * control over what the user passes us here. */ if (fd_file(f)->f_op != &xfs_file_operations || - fd_file(tmp)->f_op != &xfs_file_operations) { - error = -EINVAL; - goto out_put_tmp_file; - } + fd_file(tmp)->f_op != &xfs_file_operations) + return -EINVAL; ip = XFS_I(file_inode(fd_file(f))); tip = XFS_I(file_inode(fd_file(tmp))); - if (ip->i_mount != tip->i_mount) { - error = -EINVAL; - goto out_put_tmp_file; - } - - if (ip->i_ino == tip->i_ino) { - error = -EINVAL; - goto out_put_tmp_file; - } + if (ip->i_mount != tip->i_mount) + return -EINVAL; - if (xfs_is_shutdown(ip->i_mount)) { - error = -EIO; - goto out_put_tmp_file; - } + if (ip->i_ino == tip->i_ino) + return -EINVAL; - error = xfs_swap_extents(ip, tip, sxp); + if (xfs_is_shutdown(ip->i_mount)) + return -EIO; - out_put_tmp_file: - fdput(tmp); - out_put_file: - fdput(f); - out: - return error; + return xfs_swap_extents(ip, tip, sxp); } static int diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1e11f48814c0..86da16f54be9 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -707,7 +707,7 @@ imap_needs_cow( return false; /* when zeroing we don't have to COW holes or unwritten extents */ - if (flags & IOMAP_ZERO) { + if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) { if (!nimaps || imap->br_startblock == HOLESTARTBLOCK || imap->br_state == XFS_EXT_UNWRITTEN) @@ -975,6 +975,7 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; unsigned int lockmode = XFS_ILOCK_EXCL; + unsigned int iomap_flags = 0; u64 seq; if (xfs_is_shutdown(mp)) @@ -1145,6 +1146,11 @@ xfs_buffered_write_iomap_begin( } } + /* + * Flag newly allocated delalloc 
blocks with IOMAP_F_NEW so we punch + * them out if the write happens to fail. + */ + iomap_flags |= IOMAP_F_NEW; if (allocfork == XFS_COW_FORK) { error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, end_fsb - offset_fsb, prealloc_blocks, &cmap, @@ -1162,19 +1168,11 @@ xfs_buffered_write_iomap_begin( if (error) goto out_unlock; - /* - * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch - * them out if the write happens to fail. - */ - seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); - xfs_iunlock(ip, lockmode); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); - found_imap: - seq = xfs_iomap_inode_sequence(ip, 0); + seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); convert_delay: xfs_iunlock(ip, lockmode); @@ -1188,20 +1186,20 @@ convert_delay: return 0; found_cow: - seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, + xfs_iomap_inode_sequence(ip, 0)); if (error) goto out_unlock; - seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); - xfs_iunlock(ip, lockmode); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED, seq); + } else { + xfs_trim_extent(&cmap, offset_fsb, + imap.br_startoff - offset_fsb); } - xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); + iomap_flags |= IOMAP_F_SHARED; + seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, iomap_flags, seq); out_unlock: xfs_iunlock(ip, lockmode); @@ -1215,7 +1213,10 @@ xfs_buffered_write_delalloc_punch( loff_t length, struct iomap *iomap) { - 
xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, offset + length); + xfs_bmap_punch_delalloc_range(XFS_I(inode), + (iomap->flags & IOMAP_F_SHARED) ? + XFS_COW_FORK : XFS_DATA_FORK, + offset, offset + length); } static int @@ -1227,8 +1228,30 @@ xfs_buffered_write_iomap_end( unsigned flags, struct iomap *iomap) { - iomap_file_buffered_write_punch_delalloc(inode, offset, length, written, - flags, iomap, &xfs_buffered_write_delalloc_punch); + loff_t start_byte, end_byte; + + /* If we didn't reserve the blocks, we're not allowed to punch them. */ + if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW)) + return 0; + + /* Nothing to do if we've written the entire delalloc extent */ + start_byte = iomap_last_written_block(inode, offset, written); + end_byte = round_up(offset + length, i_blocksize(inode)); + if (start_byte >= end_byte) + return 0; + + /* For zeroing operations the callers already hold invalidate_lock. */ + if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) { + rwsem_assert_held_write(&inode->i_mapping->invalidate_lock); + iomap_write_delalloc_release(inode, start_byte, end_byte, flags, + iomap, xfs_buffered_write_delalloc_punch); + } else { + filemap_invalidate_lock(inode->i_mapping); + iomap_write_delalloc_release(inode, start_byte, end_byte, flags, + iomap, xfs_buffered_write_delalloc_punch); + filemap_invalidate_unlock(inode->i_mapping); + } + return 0; } @@ -1435,6 +1458,8 @@ xfs_zero_range( { struct inode *inode = VFS_I(ip); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + if (IS_DAX(inode)) return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ee79cf161312..4084d26f0d78 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -570,6 +570,20 @@ xfs_stat_blksize( return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize); } +static void +xfs_get_atomic_write_attr( + struct xfs_inode *ip, + unsigned int *unit_min, + unsigned int *unit_max) +{ + if 
(!xfs_inode_can_atomicwrite(ip)) { + *unit_min = *unit_max = 0; + return; + } + + *unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize; +} + STATIC int xfs_vn_getattr( struct mnt_idmap *idmap, @@ -597,8 +611,9 @@ xfs_vn_getattr( stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode_get_atime(inode); - stat->mtime = inode_get_mtime(inode); - stat->ctime = inode_get_ctime(inode); + + fill_mg_cmtime(stat, request_mask, inode); + stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); if (xfs_has_v3inodes(mp)) { @@ -608,11 +623,6 @@ xfs_vn_getattr( } } - if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { - stat->change_cookie = inode_query_iversion(inode); - stat->result_mask |= STATX_CHANGE_COOKIE; - } - /* * Note: If you add another clause to set an attribute flag, please * update attributes_mask below. @@ -643,6 +653,14 @@ xfs_vn_getattr( stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; stat->dio_offset_align = bdev_logical_block_size(bdev); } + if (request_mask & STATX_WRITE_ATOMIC) { + unsigned int unit_min, unit_max; + + xfs_get_atomic_write_attr(ip, &unit_min, + &unit_max); + generic_fill_statx_atomic_writes(stat, + unit_min, unit_max); + } fallthrough; default: stat->blksize = xfs_stat_blksize(ip); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a13bf53fea49..704aaadb61cf 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3393,13 +3393,6 @@ xlog_do_recover( /* re-initialise in-core superblock and geometry structures */ mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); - error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks, - &mp->m_maxagi); - if (error) { - xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); - return error; - } - mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); /* Normal transactions can now occur */ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); diff --git a/fs/xfs/xfs_mount.c 
b/fs/xfs/xfs_mount.c index 1fdd79c5bfa0..25bbcc3f4ee0 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -810,8 +810,8 @@ xfs_mountfs( /* * Allocate and initialize the per-ag data. */ - error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks, - &mp->m_maxagi); + error = xfs_initialize_perag(mp, 0, sbp->sb_agcount, + mp->m_sb.sb_dblocks, &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed per-ag init: %d", error); goto out_free_dir; @@ -1048,7 +1048,7 @@ xfs_mountfs( xfs_buftarg_drain(mp->m_logdev_targp); xfs_buftarg_drain(mp->m_ddev_targp); out_free_perag: - xfs_free_perag(mp); + xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); out_free_dir: xfs_da_unmount(mp); out_remove_uuid: @@ -1129,8 +1129,7 @@ xfs_unmountfs( xfs_errortag_clearall(mp); #endif shrinker_free(mp->m_inodegc_shrinker); - xfs_free_perag(mp); - + xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); xfs_errortag_del(mp); xfs_error_sysfs_del(mp); xchk_stats_unregister(mp->m_scrub_stats); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index fbb3a1594c0d..fda75db739b1 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2063,7 +2063,7 @@ static struct file_system_type xfs_fs_type = { .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, .kill_sb = xfs_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("xfs"); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ee9f0b1f548d..fcb2bad4f76e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -691,8 +691,8 @@ DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); TRACE_EVENT(xfs_filestream_pick, - TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino, xfs_extlen_t free), - TP_ARGS(pag, ino, free), + TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino), + TP_ARGS(pag, ino), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) @@ -703,14 +703,9 @@ 
TRACE_EVENT(xfs_filestream_pick, TP_fast_assign( __entry->dev = pag->pag_mount->m_super->s_dev; __entry->ino = ino; - if (pag) { - __entry->agno = pag->pag_agno; - __entry->streams = atomic_read(&pag->pagf_fstrms); - } else { - __entry->agno = NULLAGNUMBER; - __entry->streams = 0; - } - __entry->free = free; + __entry->agno = pag->pag_agno; + __entry->streams = atomic_read(&pag->pagf_fstrms); + __entry->free = pag->pagf_freeblks; ), TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d", MAJOR(__entry->dev), MINOR(__entry->dev), |