diff options
Diffstat (limited to 'fs')
381 files changed, 6553 insertions, 4810 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 6261719f6f2a..bb1b286c49ae 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -239,6 +239,7 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler, } static int v9fs_xattr_set_acl(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -258,7 +259,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; if (value) { /* update the cached acl value */ @@ -279,7 +280,8 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, struct iattr iattr = { 0 }; struct posix_acl *old_acl = acl; - retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); + retval = posix_acl_update_mode(&init_user_ns, inode, + &iattr.ia_mode, &acl); if (retval) goto err_out; if (!acl) { @@ -297,7 +299,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, * What is the following setxattr update the * mode ? */ - v9fs_vfs_setattr_dotl(dentry, &iattr); + v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr); } break; case ACL_TYPE_DEFAULT: diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 7b763776306e..4ca56c5dd637 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -135,7 +135,8 @@ extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); -extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, +extern int v9fs_vfs_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index fd2a2b040250..d44ade76966a 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -59,7 +59,8 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); int v9fs_uflags2omode(int uflags, int extended); void v9fs_blank_wstat(struct p9_wstat *wstat); -int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); +int v9fs_vfs_setattr_dotl(struct user_namespace *, struct dentry *, + struct iattr *); int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, int datasync); int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 4a937fac1acb..8d97f0b45e9c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -251,7 +251,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, { int err = 0; - inode_init_owner(inode, NULL, mode); + inode_init_owner(&init_user_ns,inode, NULL, mode); inode->i_blocks = 0; inode->i_rdev = rdev; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); @@ -676,8 +676,8 @@ error: */ static int -v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); u32 perm = unixmode2p9mode(v9ses, mode); @@ -702,7 +702,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, * */ -static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int v9fs_vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { int err; u32 perm; @@ -907,9 +908,9 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) */ int -v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { int retval; struct inode *old_inode; @@ -1016,8 +1017,8 @@ done: */ static int -v9fs_vfs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; struct v9fs_session_info *v9ses; @@ -1027,7 +1028,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(&init_user_ns, d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -1040,7 +1041,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, return PTR_ERR(st); v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(&init_user_ns, d_inode(dentry), stat); p9stat_free(st); kfree(st); @@ -1054,7 +1055,8 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, * */ -static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) +static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; struct v9fs_session_info *v9ses; @@ -1062,7 +1064,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) struct p9_wstat wstat; p9_debug(P9_DEBUG_VFS, "\n"); - retval = setattr_prepare(dentry, iattr); + retval = setattr_prepare(&init_user_ns, dentry, iattr); if (retval) return retval; @@ -1118,7 +1120,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) v9fs_invalidate_inode_attr(d_inode(dentry)); - setattr_copy(d_inode(dentry), iattr); + setattr_copy(&init_user_ns, d_inode(dentry), iattr); mark_inode_dirty(d_inode(dentry)); return 0; } @@ -1137,9 +1139,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb, unsigned int flags) { umode_t mode; - char ext[32]; - char tag_name[14]; - unsigned int i_nlink; struct v9fs_session_info *v9ses = sb->s_fs_info; struct v9fs_inode *v9inode = V9FS_I(inode); @@ -1157,18 +1156,18 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, inode->i_gid = stat->n_gid; } if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) { - if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) { + if (v9fs_proto_dotu(v9ses)) { + unsigned int i_nlink; /* - * Hadlink support got added later to - * to the .u extension. So there can be - * server out there that doesn't support - * this even with .u extension. So check - * for non NULL stat->extension + * Hadlink support got added later to the .u extension. + * So there can be a server out there that doesn't + * support this even with .u extension. That would + * just leave us with stat->extension being an empty + * string, though. */ - strlcpy(ext, stat->extension, sizeof(ext)); /* HARDLINKCOUNT %u */ - sscanf(ext, "%13s %u", tag_name, &i_nlink); - if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) + if (sscanf(stat->extension, + " HARDLINKCOUNT %u", &i_nlink) == 1) set_nlink(inode, i_nlink); } } @@ -1295,7 +1294,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, */ static int -v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +v9fs_vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n", dir->i_ino, dentry, symname); @@ -1348,7 +1348,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, */ static int -v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +v9fs_vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); int retval; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 823c2eb5f1bf..1dc7af046615 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -33,8 +33,8 @@ #include "acl.h" static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, - dev_t rdev); +v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t omode, dev_t rdev); /** * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a @@ -218,10 +218,10 @@ int v9fs_open_to_dotl_flags(int flags) */ static int -v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, - bool excl) +v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t omode, bool excl) { - return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); + return v9fs_vfs_mknod_dotl(mnt_userns, dir, dentry, omode, 0); } static int @@ -367,8 +367,9 @@ err_clunk_old_fid: * */ -static int v9fs_vfs_mkdir_dotl(struct inode *dir, - struct dentry *dentry, umode_t omode) +static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + umode_t omode) { int err; struct v9fs_session_info *v9ses; @@ -457,8 +458,9 @@ error: } static int -v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; struct v9fs_session_info *v9ses; @@ -468,7 +470,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(&init_user_ns, d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -485,7 +487,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, return PTR_ERR(st); v9fs_stat2inode_dotl(st, d_inode(dentry), 0); - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(&init_user_ns, d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -540,7 +542,8 @@ static int v9fs_mapped_iattr_valid(int iattr_valid) * */ -int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) +int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; struct p9_fid *fid = NULL; @@ -549,7 +552,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) p9_debug(P9_DEBUG_VFS, "\n"); - retval = setattr_prepare(dentry, iattr); + retval = setattr_prepare(&init_user_ns, dentry, iattr); if (retval) return retval; @@ -590,7 +593,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) truncate_setsize(inode, iattr->ia_size); v9fs_invalidate_inode_attr(inode); - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { /* We also want to update ACL when we update mode bits */ @@ -684,8 +687,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, } static int -v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, - const char *symname) +v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { int err; kgid_t gid; @@ -824,8 +827,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, * */ static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, - dev_t rdev) +v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t omode, dev_t rdev) { int err; kgid_t gid; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 87217dd0433e..ee331845e2c7 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -157,6 +157,7 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler, } static int v9fs_xattr_handler_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 699c4fa8b78b..06b7c92343ad 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -144,7 +144,8 @@ struct adfs_discmap { /* Inode stuff */ struct inode *adfs_iget(struct super_block *sb, struct object_info *obj); int adfs_write_inode(struct inode *inode, struct writeback_control *wbc); -int adfs_notify_change(struct dentry *dentry, struct iattr *attr); +int adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); /* map.c */ int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset); diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 32620f4a7623..fb7ee026d101 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -292,14 +292,15 @@ out: * later. */ int -adfs_notify_change(struct dentry *dentry, struct iattr *attr) +adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; unsigned int ia_valid = attr->ia_valid; int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); /* * we can't change the UID or GID of any file - diff --git a/fs/affs/affs.h b/fs/affs/affs.h index a755bef7c4c7..bfa89e131ead 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -167,27 +167,33 @@ extern const struct export_operations affs_export_ops; extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int); extern int affs_unlink(struct inode *dir, struct dentry *dentry); -extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool); -extern int affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); +extern int affs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool); +extern int affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, struct dentry *dentry); -extern int affs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname); -extern int affs_rename2(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags); +extern int affs_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + const char *symname); +extern int affs_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags); /* inode.c */ extern struct inode *affs_new_inode(struct inode *dir); -extern int affs_notify_change(struct dentry *dentry, struct iattr *attr); +extern int affs_notify_change(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr); extern void affs_evict_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, unsigned long ino); extern int affs_write_inode(struct inode *inode, struct writeback_control *wbc); -extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type); +extern int affs_add_entry(struct inode *dir, struct inode *inode, + struct dentry *dentry, s32 type); /* file.c */ diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 044412110b52..2352a75bd9d6 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -216,14 +216,15 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) } int -affs_notify_change(struct dentry *dentry, struct iattr *attr) +affs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) goto out; @@ -249,7 +250,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr) affs_truncate(inode); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); if (attr->ia_valid & ATTR_MODE) diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 5400a876d73f..bcab18956b4f 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -242,7 +242,8 @@ affs_unlink(struct inode *dir, struct dentry *dentry) } int -affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) +affs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -273,7 +274,8 @@ affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) } int -affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; int error; @@ -311,7 +313,8 @@ affs_rmdir(struct inode *dir, struct dentry *dentry) } int -affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +affs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct super_block *sb = dir->i_sb; struct buffer_head *bh; @@ -500,9 +503,9 @@ done: return retval; } -int affs_rename2(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +int affs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 7bd659ad959e..714fcca9af99 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -28,18 +28,19 @@ static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int loff_t fpos, u64 ino, unsigned dtype); static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); -static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl); -static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); +static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl); +static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode); static int afs_rmdir(struct inode *dir, struct dentry *dentry); static int afs_unlink(struct inode *dir, struct dentry *dentry); static int afs_link(struct dentry *from, struct inode *dir, struct dentry *dentry); -static int afs_symlink(struct inode *dir, struct dentry *dentry, - const char *content); -static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags); +static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *content); +static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags); static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags); static void afs_dir_invalidatepage(struct page *page, unsigned int offset, unsigned int length); @@ -1325,7 +1326,8 @@ static const struct afs_operation_ops afs_mkdir_operation = { /* * create a directory on an AFS filesystem */ -static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); @@ -1619,8 +1621,8 @@ static const struct afs_operation_ops afs_create_operation = { /* * create a regular file on an AFS filesystem */ -static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); @@ -1741,8 +1743,8 @@ static const struct afs_operation_ops afs_symlink_operation = { /* * create a symlink in an AFS filesystem */ -static int afs_symlink(struct inode *dir, struct dentry *dentry, - const char *content) +static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *content) { struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); @@ -1876,9 +1878,9 @@ static const struct afs_operation_ops afs_rename_operation = { /* * rename a file in an AFS filesystem and/or move it between directories */ -static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct afs_operation *op; struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index b0d7b892090d..1156b2df28d3 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -734,8 +734,8 @@ error_unlock: /* * read the attributes of an inode */ -int afs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct afs_vnode *vnode = AFS_FS_I(inode); @@ -745,7 +745,7 @@ int afs_getattr(const struct path *path, struct kstat *stat, do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && stat->nlink > 0) stat->nlink -= 1; @@ -857,7 +857,8 @@ static const struct afs_operation_ops afs_setattr_operation = { /* * set the attributes of an inode */ -int afs_setattr(struct dentry *dentry, struct iattr *attr) +int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 0d150a29e39e..b626e38e9ab5 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1149,8 +1149,9 @@ extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); extern bool afs_check_validity(struct afs_vnode *); extern int afs_validate(struct afs_vnode *, struct key *); -extern int afs_getattr(const struct path *, struct kstat *, u32, unsigned int); -extern int afs_setattr(struct dentry *, struct iattr *); +extern int afs_getattr(struct user_namespace *mnt_userns, const struct path *, + struct kstat *, u32, unsigned int); +extern int afs_setattr(struct user_namespace *mnt_userns, struct dentry *, struct iattr *); extern void afs_evict_inode(struct inode *); extern int afs_drop_inode(struct inode *); @@ -1361,7 +1362,7 @@ extern void afs_zap_permits(struct rcu_head *); extern struct key *afs_request_key(struct afs_cell *); extern struct key *afs_request_key_rcu(struct afs_cell *); extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); -extern int afs_permission(struct inode *, int); +extern int afs_permission(struct user_namespace *, struct inode *, int); extern void __exit afs_clean_up_permit_cache(void); /* diff --git a/fs/afs/security.c b/fs/afs/security.c index 9cf3102f370c..3c7a8fc4f93f 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -396,7 +396,8 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, * - AFS ACLs are attached to directories only, and a file is controlled by its * parent directory's ACL */ -int afs_permission(struct inode *inode, int mask) +int afs_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { struct afs_vnode *vnode = AFS_FS_I(inode); afs_access_t access; diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 95c573dcda11..c629caae5002 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -120,6 +120,7 @@ static const struct afs_operation_ops afs_store_acl_operation = { * Set a file's AFS3 ACL. */ static int afs_xattr_set_acl(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -248,6 +249,7 @@ static const struct afs_operation_ops yfs_store_opaque_acl2_operation = { * Set a file's YFS ACL. */ static int afs_xattr_set_yfs(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/attr.c b/fs/attr.c index b4bbdbd4c8ca..87ef39db1c34 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -18,27 +18,55 @@ #include <linux/evm.h> #include <linux/ima.h> -static bool chown_ok(const struct inode *inode, kuid_t uid) +/** + * chown_ok - verify permissions to chown inode + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check permissions on + * @uid: uid to chown @inode to + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before checking + * permissions. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. + */ +static bool chown_ok(struct user_namespace *mnt_userns, + const struct inode *inode, + kuid_t uid) { - if (uid_eq(current_fsuid(), inode->i_uid) && - uid_eq(uid, inode->i_uid)) + kuid_t kuid = i_uid_into_mnt(mnt_userns, inode); + if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, kuid)) return true; - if (capable_wrt_inode_uidgid(inode, CAP_CHOWN)) + if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; - if (uid_eq(inode->i_uid, INVALID_UID) && + if (uid_eq(kuid, INVALID_UID) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; } -static bool chgrp_ok(const struct inode *inode, kgid_t gid) +/** + * chgrp_ok - verify permissions to chgrp inode + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check permissions on + * @gid: gid to chown @inode to + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before checking + * permissions. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. + */ +static bool chgrp_ok(struct user_namespace *mnt_userns, + const struct inode *inode, kgid_t gid) { - if (uid_eq(current_fsuid(), inode->i_uid) && - (in_group_p(gid) || gid_eq(gid, inode->i_gid))) + kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); + if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) && + (in_group_p(gid) || gid_eq(gid, kgid))) return true; - if (capable_wrt_inode_uidgid(inode, CAP_CHOWN)) + if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; - if (gid_eq(inode->i_gid, INVALID_GID) && + if (gid_eq(kgid, INVALID_GID) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; @@ -46,6 +74,7 @@ static bool chgrp_ok(const struct inode *inode, kgid_t gid) /** * setattr_prepare - check if attribute changes to a dentry are allowed + * @mnt_userns: user namespace of the mount the inode was found from * @dentry: dentry to check * @attr: attributes to change * @@ -55,10 +84,17 @@ static bool chgrp_ok(const struct inode *inode, kgid_t gid) * SGID bit from mode if user is not allowed to set it. Also file capabilities * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set. * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before checking + * permissions. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. + * * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. */ -int setattr_prepare(struct dentry *dentry, struct iattr *attr) +int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; @@ -78,27 +114,27 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr) goto kill_priv; /* Make sure a caller can chown. */ - if ((ia_valid & ATTR_UID) && !chown_ok(inode, attr->ia_uid)) + if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid)) return -EPERM; /* Make sure caller can chgrp. */ - if ((ia_valid & ATTR_GID) && !chgrp_ok(inode, attr->ia_gid)) + if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid)) return -EPERM; /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; /* Also check the setgid bit! */ - if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : - inode->i_gid) && - !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : + i_gid_into_mnt(mnt_userns, inode)) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } /* Check for setting the inode time. */ if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; } @@ -107,7 +143,7 @@ kill_priv: if (ia_valid & ATTR_KILL_PRIV) { int error; - error = security_inode_killpriv(dentry); + error = security_inode_killpriv(mnt_userns, dentry); if (error) return error; } @@ -162,20 +198,33 @@ EXPORT_SYMBOL(inode_newsize_ok); /** * setattr_copy - copy simple metadata updates into the generic inode + * @mnt_userns: user namespace of the mount the inode was found from * @inode: the inode to be updated * @attr: the new attributes * * setattr_copy must be called with i_mutex held. * * setattr_copy updates the inode's metadata with that specified - * in attr. Noticeably missing is inode size update, which is more complex + * in attr on idmapped mounts. If file ownership is changed setattr_copy + * doesn't map ia_uid and ia_gid. It will asssume the caller has already + * provided the intended values. Necessary permission checks to determine + * whether or not the S_ISGID property needs to be removed are performed with + * the correct idmapped mount permission helpers. + * Noticeably missing is inode size update, which is more complex * as it requires pagecache updates. * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before checking + * permissions. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. + * * The inode is not marked as dirty after this operation. The rationale is * that for "simple" filesystems, the struct inode is the inode storage. * The caller is free to mark the inode dirty afterwards if needed. */ -void setattr_copy(struct inode *inode, const struct iattr *attr) +void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, + const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; @@ -191,9 +240,9 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - - if (!in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); + if (!in_group_p(kgid) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } @@ -202,6 +251,7 @@ EXPORT_SYMBOL(setattr_copy); /** * notify_change - modify attributes of a filesytem object + * @mnt_userns: user namespace of the mount the inode was found from * @dentry: object affected * @attr: new attributes * @delegated_inode: returns inode, if the inode is delegated @@ -214,13 +264,23 @@ EXPORT_SYMBOL(setattr_copy); * retry. Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * + * If file ownership is changed notify_change() doesn't map ia_uid and + * ia_gid. It will asssume the caller has already provided the intended values. + * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. Also, passing NULL is fine for callers holding * the file open for write, as there can be no conflicting delegation in * that case. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before checking + * permissions. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. */ -int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode) +int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; umode_t mode = inode->i_mode; @@ -243,8 +303,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de if (IS_IMMUTABLE(inode)) return -EPERM; - if (!inode_owner_or_capable(inode)) { - error = inode_permission(inode, MAY_WRITE); + if (!inode_owner_or_capable(mnt_userns, inode)) { + error = inode_permission(mnt_userns, inode, MAY_WRITE); if (error) return error; } @@ -320,9 +380,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de /* Don't allow modifications of files with invalid uids or * gids unless those uids & gids are being made valid. */ - if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid)) + if (!(ia_valid & ATTR_UID) && + !uid_valid(i_uid_into_mnt(mnt_userns, inode))) return -EOVERFLOW; - if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid)) + if (!(ia_valid & ATTR_GID) && + !gid_valid(i_gid_into_mnt(mnt_userns, inode))) return -EOVERFLOW; error = security_inode_setattr(dentry, attr); @@ -333,13 +395,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de return error; if (inode->i_op->setattr) - error = inode->i_op->setattr(dentry, attr); + error = inode->i_op->setattr(mnt_userns, dentry, attr); else - error = simple_setattr(dentry, attr); + error = simple_setattr(mnt_userns, dentry, attr); if (!error) { fsnotify_change(dentry, ia_valid); - ima_inode_post_setattr(dentry); + ima_inode_post_setattr(mnt_userns, dentry); evm_inode_post_setattr(dentry, ia_valid); } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 5aaa1732bf1e..91fe4548c256 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -10,10 +10,12 @@ #include "autofs_i.h" -static int autofs_dir_symlink(struct inode *, struct dentry *, const char *); +static int autofs_dir_symlink(struct user_namespace *, struct inode *, + struct dentry *, const char *); static int autofs_dir_unlink(struct inode *, struct dentry *); static int autofs_dir_rmdir(struct inode *, struct dentry *); -static int autofs_dir_mkdir(struct inode *, struct dentry *, umode_t); +static int autofs_dir_mkdir(struct user_namespace *, struct inode *, + struct dentry *, umode_t); static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT static long autofs_root_compat_ioctl(struct file *, @@ -524,9 +526,9 @@ static struct dentry *autofs_lookup(struct inode *dir, return NULL; } -static int autofs_dir_symlink(struct inode *dir, - struct dentry *dentry, - const char *symname) +static int autofs_dir_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + const char *symname) { struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); @@ -715,8 +717,9 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs_dir_mkdir(struct inode *dir, - struct dentry *dentry, umode_t mode) +static int autofs_dir_mkdir(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + umode_t mode) { struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 54f0ce444272..48e16144c1f7 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -27,8 +27,9 @@ static const struct file_operations bad_file_ops = .open = bad_file_open, }; -static int bad_inode_create (struct inode *dir, struct dentry *dentry, - umode_t mode, bool excl) +static int bad_inode_create(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) { return -EIO; } @@ -50,14 +51,15 @@ static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) return -EIO; } -static int bad_inode_symlink (struct inode *dir, struct dentry *dentry, - const char *symname) +static int bad_inode_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + const char *symname) { return -EIO; } -static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry, - umode_t mode) +static int bad_inode_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { return -EIO; } @@ -67,13 +69,14 @@ static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) return -EIO; } -static int bad_inode_mknod (struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int bad_inode_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } -static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry, +static int bad_inode_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -86,18 +89,21 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, return -EIO; } -static int bad_inode_permission(struct inode *inode, int mask) +static int bad_inode_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { return -EIO; } -static int bad_inode_getattr(const struct path *path, struct kstat *stat, +static int bad_inode_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } -static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs) +static int bad_inode_setattr(struct user_namespace *mnt_userns, + struct dentry *direntry, struct iattr *attrs) { return -EIO; } @@ -140,13 +146,15 @@ static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, return -EIO; } -static int bad_inode_tmpfile(struct inode *inode, struct dentry *dentry, +static int bad_inode_tmpfile(struct user_namespace *mnt_userns, + struct inode *inode, struct dentry *dentry, umode_t mode) { return -EIO; } -static int bad_inode_set_acl(struct inode *inode, struct posix_acl *acl, +static int bad_inode_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, struct posix_acl *acl, int type) { return -EIO; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index d8dfe3a0cb39..34d4f68f786b 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -75,8 +75,8 @@ const struct file_operations bfs_dir_operations = { .llseek = generic_file_llseek, }; -static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int bfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { int err; struct inode *inode; @@ -96,7 +96,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } set_bit(ino, info->si_imap); info->si_freei--; - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; inode->i_op = &bfs_file_inops; @@ -199,9 +199,9 @@ out_brelse: return error; } -static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int bfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh; diff --git a/fs/block_dev.c b/fs/block_dev.c index ec26179c8062..4aa1f88d5bf8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -221,7 +221,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio) static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, - int nr_pages) + unsigned int nr_pages) { struct file *file = iocb->ki_filp; struct block_device *bdev = I_BDEV(bdev_file_inode(file)); @@ -355,8 +355,8 @@ static void blkdev_bio_end_io(struct bio *bio) } } -static ssize_t -__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) +static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + unsigned int nr_pages) { struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); @@ -486,7 +486,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { - int nr_pages; + unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; @@ -495,7 +495,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES)); + return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); } static __init int blkdev_init(void) @@ -1270,7 +1270,7 @@ rescan: return ret; } /* - * Only exported for for loop and dasd for historic reasons. Don't use in new + * Only exported for loop and dasd for historic reasons. Don't use in new * code! */ EXPORT_SYMBOL_GPL(bdev_disk_changed); diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index a0af1b952c4d..d95eb5c8cb37 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -107,13 +107,15 @@ out: return ret; } -int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int ret; umode_t old_mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); + ret = posix_acl_update_mode(&init_user_ns, inode, + &inode->i_mode, &acl); if (ret) return ret; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5064be59dac5..744b99ddc28c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1162,6 +1162,11 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) spin_lock(&sinfo->lock); spin_lock(&cache->lock); + if (cache->swap_extents) { + ret = -ETXTBSY; + goto out; + } + if (cache->ro) { cache->ro++; ret = 0; @@ -2307,7 +2312,7 @@ again: } ret = inc_block_group_ro(cache, 0); - if (!do_chunk_alloc) + if (!do_chunk_alloc || ret == -ETXTBSY) goto unlock_out; if (!ret) goto out; @@ -2316,6 +2321,8 @@ again: if (ret < 0) goto out; ret = inc_block_group_ro(cache, 0); + if (ret == -ETXTBSY) + goto unlock_out; out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); @@ -3406,6 +3413,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) ASSERT(list_empty(&block_group->io_list)); ASSERT(list_empty(&block_group->bg_list)); ASSERT(refcount_read(&block_group->refs) == 1); + ASSERT(block_group->swap_extents == 0); btrfs_put_block_group(block_group); spin_lock(&info->block_group_cache_lock); @@ -3472,3 +3480,26 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) __btrfs_remove_free_space_cache(block_group->free_space_ctl); } } + +bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg) +{ + bool ret = true; + + spin_lock(&bg->lock); + if (bg->ro) + ret = false; + else + bg->swap_extents++; + spin_unlock(&bg->lock); + + return ret; +} + +void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount) +{ + spin_lock(&bg->lock); + ASSERT(!bg->ro); + ASSERT(bg->swap_extents >= amount); + bg->swap_extents -= amount; + spin_unlock(&bg->lock); +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 29678426247d..3ecc3372a5ce 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -186,6 +186,12 @@ struct btrfs_block_group { /* Flag indicating this block group is placed on a sequential zone */ bool seq_zone; + /* + * Number of extents in this block group used for swap files. + * All accesses protected by the spinlock 'lock'. + */ + int swap_extents; + /* Record locked full stripes for RAID5/6 block group */ struct btrfs_full_stripe_locks_tree full_stripe_locks_root; @@ -312,4 +318,7 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache) void btrfs_freeze_block_group(struct btrfs_block_group *cache); void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); +bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); +void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); + #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 6d203acfdeb3..3f4c832abfed 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -141,6 +141,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); const u32 csum_size = fs_info->csum_size; + const u32 sectorsize = fs_info->sectorsize; struct page *page; unsigned long i; char *kaddr; @@ -154,22 +155,34 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, shash->tfm = fs_info->csum_shash; for (i = 0; i < cb->nr_pages; i++) { + u32 pg_offset; + u32 bytes_left = PAGE_SIZE; page = cb->compressed_pages[i]; - kaddr = kmap_atomic(page); - crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum); - kunmap_atomic(kaddr); - - if (memcmp(&csum, cb_sum, csum_size)) { - btrfs_print_data_csum_error(inode, disk_start, - csum, cb_sum, cb->mirror_num); - if (btrfs_io_bio(bio)->device) - btrfs_dev_stat_inc_and_print( - btrfs_io_bio(bio)->device, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - return -EIO; + /* Determine the remaining bytes inside the page first */ + if (i == cb->nr_pages - 1) + bytes_left = cb->compressed_len - i * PAGE_SIZE; + + /* Hash through the page sector by sector */ + for (pg_offset = 0; pg_offset < bytes_left; + pg_offset += sectorsize) { + kaddr = kmap_atomic(page); + crypto_shash_digest(shash, kaddr + pg_offset, + sectorsize, csum); + kunmap_atomic(kaddr); + + if (memcmp(&csum, cb_sum, csum_size) != 0) { + btrfs_print_data_csum_error(inode, disk_start, + csum, cb_sum, cb->mirror_num); + if (btrfs_io_bio(bio)->device) + btrfs_dev_stat_inc_and_print( + btrfs_io_bio(bio)->device, + BTRFS_DEV_STAT_CORRUPTION_ERRS); + return -EIO; + } + cb_sum += csum_size; + disk_start += sectorsize; } - cb_sum += csum_size; } return 0; } @@ -640,7 +653,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, page_offset(bio_first_page_all(bio)), - PAGE_SIZE); + fs_info->sectorsize); read_unlock(&em_tree->lock); if (!em) return BLK_STS_IOERR; @@ -698,19 +711,30 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, refcount_set(&cb->pending_bios, 1); for (pg_index = 0; pg_index < nr_pages; pg_index++) { + u32 pg_len = PAGE_SIZE; int submit = 0; + /* + * To handle subpage case, we need to make sure the bio only + * covers the range we need. + * + * If we're at the last page, truncate the length to only cover + * the remaining part. + */ + if (pg_index == nr_pages - 1) + pg_len = min_t(u32, PAGE_SIZE, + compressed_len - pg_index * PAGE_SIZE); + page = cb->compressed_pages[pg_index]; page->mapping = inode->i_mapping; page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, + submit = btrfs_bio_fits_in_stripe(page, pg_len, comp_bio, 0); page->mapping = NULL; - if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < - PAGE_SIZE) { + if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) { unsigned int nr_sectors; ret = btrfs_bio_wq_end_io(fs_info, comp_bio, @@ -743,9 +767,9 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; - bio_add_page(comp_bio, page, PAGE_SIZE, 0); + bio_add_page(comp_bio, page, pg_len, 0); } - cur_disk_byte += PAGE_SIZE; + cur_disk_byte += pg_len; } ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); @@ -1237,7 +1261,6 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, unsigned long prev_start_byte; unsigned long working_bytes = total_out - buf_start; unsigned long bytes; - char *kaddr; struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter); /* @@ -1268,9 +1291,8 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, PAGE_SIZE - (buf_offset % PAGE_SIZE)); bytes = min(bytes, working_bytes); - kaddr = kmap_atomic(bvec.bv_page); - memcpy(kaddr + bvec.bv_offset, buf + buf_offset, bytes); - kunmap_atomic(kaddr); + memcpy_to_page(bvec.bv_page, bvec.bv_offset, buf + buf_offset, + bytes); flush_dcache_page(bvec.bv_page); buf_offset += bytes; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3bc00aed13b2..9ae776ab3967 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -524,6 +524,11 @@ struct btrfs_swapfile_pin { * points to a struct btrfs_device. */ bool is_block_group; + /* + * Only used when 'is_block_group' is true and it is the number of + * extents used by a swapfile for this block group ('ptr' field). + */ + int bg_extent_count; }; bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); @@ -3635,7 +3640,8 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); -int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); #else diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index ec0b50b8c5d6..bf25401c9768 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -627,7 +627,8 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { - ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); + ret = btrfs_qgroup_reserve_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC, true); if (ret < 0) return ret; ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, @@ -649,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata( btrfs_ino(inode), num_bytes, 1); } else { - btrfs_qgroup_free_meta_prealloc(root, fs_info->nodesize); + btrfs_qgroup_free_meta_prealloc(root, num_bytes); } return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dfb3ead1175..4671c99d468d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3008,12 +3008,23 @@ readpage_ok: if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned off; - /* Zero out the end if this page straddles i_size */ - off = offset_in_page(i_size); - if (page->index == end_index && off) - zero_user_segment(page, off, PAGE_SIZE); + /* + * Zero out the remaining part if this range straddles + * i_size. + * + * Here we should only zero the range inside the bvec, + * not touch anything else. + * + * NOTE: i_size is exclusive while end is inclusive. + */ + if (page->index == end_index && i_size <= end) { + u32 zero_start = max(offset_in_page(i_size), + offset_in_page(end)); + + zero_user_segment(page, zero_start, + offset_in_page(end) + 1); + } } ASSERT(bio_offset + len > bio_offset); bio_offset += len; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index be9e3900cce8..0e155f013839 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3260,8 +3260,11 @@ reserve_space: goto out; ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); - if (ret) + if (ret) { + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); goto out; + } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, i_blocksize(inode), @@ -3634,7 +3637,7 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; } - return generic_file_buffered_read(iocb, to, ret); + return filemap_read(iocb, to, ret); } const struct file_operations btrfs_file_operations = { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5400294bd271..9988decd5717 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2555,7 +2555,12 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, to_unusable = size - to_free; ctl->free_space += to_free; - block_group->zone_unusable += to_unusable; + /* + * If the block group is read-only, we should account freed space into + * bytes_readonly. + */ + if (!block_group->ro) + block_group->zone_unusable += to_unusable; spin_unlock(&ctl->tree_lock); if (!used) { spin_lock(&block_group->lock); @@ -2801,8 +2806,10 @@ static void __btrfs_return_cluster_to_free_space( struct rb_node *node; spin_lock(&cluster->lock); - if (cluster->block_group != block_group) - goto out; + if (cluster->block_group != block_group) { + spin_unlock(&cluster->lock); + return; + } cluster->block_group = NULL; cluster->window_start = 0; @@ -2840,8 +2847,6 @@ static void __btrfs_return_cluster_to_free_space( entry->offset, &entry->offset_index, bitmap); } cluster->root = RB_ROOT; - -out: spin_unlock(&cluster->lock); btrfs_put_block_group(block_group); } @@ -3125,8 +3130,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, entry->bytes -= bytes; } - if (entry->bytes == 0) - rb_erase(&entry->offset_index, &cluster->root); break; } out: @@ -3143,7 +3146,10 @@ out: ctl->free_space -= bytes; if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; + + spin_lock(&cluster->lock); if (entry->bytes == 0) { + rb_erase(&entry->offset_index, &cluster->root); ctl->free_extents--; if (entry->bitmap) { kmem_cache_free(btrfs_free_space_bitmap_cachep, @@ -3156,6 +3162,7 @@ out: kmem_cache_free(btrfs_free_space_cachep, entry); } + spin_unlock(&cluster->lock); spin_unlock(&ctl->tree_lock); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 535abf898225..35bfa0533f23 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1674,9 +1674,6 @@ next_slot: */ btrfs_release_path(path); - /* If extent is RO, we must COW it */ - if (btrfs_extent_readonly(fs_info, disk_bytenr)) - goto out_check; ret = btrfs_cross_ref_exist(root, ino, found_key.offset - extent_offset, disk_bytenr, false); @@ -1723,6 +1720,7 @@ next_slot: WARN_ON_ONCE(freespace_inode); goto out_check; } + /* If the extent's block group is RO, we must COW */ if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) goto out_check; nocow = true; @@ -5212,7 +5210,8 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) return ret; } -static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) +static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -5221,7 +5220,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (btrfs_root_readonly(root)) return -EROFS; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) return err; @@ -5232,12 +5231,13 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid) { - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); inode_inc_iversion(inode); err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(inode, inode->i_mode); + err = posix_acl_chmod(&init_user_ns, inode, + inode->i_mode); } return err; @@ -6083,7 +6083,7 @@ static int btrfs_dirty_inode(struct inode *inode) return PTR_ERR(trans); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (ret && ret == -ENOSPC) { + if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); trans = btrfs_start_transaction(root, 1); @@ -6357,7 +6357,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (ret != 0) goto fail_unlock; - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode_set_bytes(inode, 0); inode->i_mtime = current_time(inode); @@ -6518,8 +6518,8 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans, return err; } -static int btrfs_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; @@ -6582,8 +6582,8 @@ out_unlock: return err; } -static int btrfs_create(struct inode *dir, struct dentry *dentry, - umode_t mode, bool excl) +static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; @@ -6727,7 +6727,8 @@ fail: return err; } -static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct inode *inode = NULL; @@ -9017,7 +9018,8 @@ fail: return -ENOMEM; } -static int btrfs_getattr(const struct path *path, struct kstat *stat, +static int btrfs_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { u64 delalloc_bytes; @@ -9043,7 +9045,7 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); @@ -9534,9 +9536,9 @@ out_notrans: return ret; } -static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; @@ -9744,8 +9746,8 @@ out: return ret; } -static int btrfs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; @@ -10079,7 +10081,8 @@ static int btrfs_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } -static int btrfs_permission(struct inode *inode, int mask) +static int btrfs_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct btrfs_root *root = BTRFS_I(inode)->root; umode_t mode = inode->i_mode; @@ -10091,10 +10094,11 @@ static int btrfs_permission(struct inode *inode, int mask) if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) return -EACCES; } - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } -static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; @@ -10194,6 +10198,7 @@ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, sp->ptr = ptr; sp->inode = inode; sp->is_block_group = is_block_group; + sp->bg_extent_count = 1; spin_lock(&fs_info->swapfile_pins_lock); p = &fs_info->swapfile_pins.rb_node; @@ -10207,6 +10212,8 @@ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, (sp->ptr == entry->ptr && sp->inode > entry->inode)) { p = &(*p)->rb_right; } else { + if (is_block_group) + entry->bg_extent_count++; spin_unlock(&fs_info->swapfile_pins_lock); kfree(sp); return 1; @@ -10232,8 +10239,11 @@ static void btrfs_free_swapfile_pins(struct inode *inode) sp = rb_entry(node, struct btrfs_swapfile_pin, node); if (sp->inode == inode) { rb_erase(&sp->node, &fs_info->swapfile_pins); - if (sp->is_block_group) + if (sp->is_block_group) { + btrfs_dec_block_group_swap_extents(sp->ptr, + sp->bg_extent_count); btrfs_put_block_group(sp->ptr); + } kfree(sp); } node = next; @@ -10294,7 +10304,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; struct extent_map *em = NULL; @@ -10345,13 +10356,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, "cannot activate swapfile while exclusive operation is running"); return -EBUSY; } + + /* + * Prevent snapshot creation while we are activating the swap file. + * We do not want to race with snapshot creation. If snapshot creation + * already started before we bumped nr_swapfiles from 0 to 1 and + * completes before the first write into the swap file after it is + * activated, than that write would fallback to COW. + */ + if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) { + btrfs_exclop_finish(fs_info); + btrfs_warn(fs_info, + "cannot activate swapfile because snapshot creation is in progress"); + return -EINVAL; + } /* * Snapshots can create extents which require COW even if NODATACOW is * set. We use this counter to prevent snapshots. We must increment it * before walking the extents because we don't want a concurrent * snapshot to run after we've already checked the extents. */ - atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles); + atomic_inc(&root->nr_swapfiles); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); @@ -10448,6 +10473,17 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, goto out; } + if (!btrfs_inc_block_group_swap_extents(bg)) { + btrfs_warn(fs_info, + "block group for swapfile at %llu is read-only%s", + bg->start, + atomic_read(&fs_info->scrubs_running) ? + " (scrub running)" : ""); + btrfs_put_block_group(bg); + ret = -EINVAL; + goto out; + } + ret = btrfs_add_swapfile_pin(inode, bg, true); if (ret) { btrfs_put_block_group(bg); @@ -10486,6 +10522,8 @@ out: if (ret) btrfs_swap_deactivate(file); + btrfs_drew_write_unlock(&root->snapshot_lock); + btrfs_exclop_finish(fs_info); if (ret) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a8c60d46d19c..e8d53fea4c61 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -213,7 +213,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) const char *comp = NULL; u32 binode_flags; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; if (btrfs_root_readonly(root)) @@ -429,7 +429,7 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) unsigned old_i_flags; int ret = 0; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; if (btrfs_root_readonly(root)) @@ -925,13 +925,14 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) BUG_ON(d_inode(victim->d_parent) != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || - IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) + if (check_sticky(&init_user_ns, dir, d_inode(victim)) || + IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) || + IS_SWAPFILE(d_inode(victim))) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -954,7 +955,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return inode_permission(dir, MAY_WRITE | MAY_EXEC); + return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); } /* @@ -1871,7 +1872,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); ret = -EXDEV; - } else if (!inode_owner_or_capable(src_inode)) { + } else if (!inode_owner_or_capable(&init_user_ns, src_inode)) { /* * Subvolume creation is not restricted, but snapshots * are limited to own subvolumes only @@ -1935,7 +1936,10 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { - if (vol_args->size > PAGE_SIZE) { + u64 nums; + + if (vol_args->size < sizeof(*inherit) || + vol_args->size > PAGE_SIZE) { ret = -EINVAL; goto free_args; } @@ -1944,6 +1948,20 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ret = PTR_ERR(inherit); goto free_args; } + + if (inherit->num_qgroups > PAGE_SIZE || + inherit->num_ref_copies > PAGE_SIZE || + inherit->num_excl_copies > PAGE_SIZE) { + ret = -EINVAL; + goto free_inherit; + } + + nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + + 2 * inherit->num_excl_copies; + if (vol_args->size != struct_size(inherit, qgroups, nums)) { + ret = -EINVAL; + goto free_inherit; + } } ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd, @@ -1991,7 +2009,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, u64 flags; int ret = 0; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; ret = mnt_want_write_file(file); @@ -2547,7 +2565,8 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, ret = PTR_ERR(temp_inode); goto out_put; } - ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC); + ret = inode_permission(&init_user_ns, temp_inode, + MAY_READ | MAY_EXEC); iput(temp_inode); if (ret) { ret = -EACCES; @@ -3077,7 +3096,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (root == dest) goto out_dput; - err = inode_permission(inode, MAY_WRITE | MAY_EXEC); + err = inode_permission(&init_user_ns, inode, + MAY_WRITE | MAY_EXEC); if (err) goto out_dput; } @@ -3148,7 +3168,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) * running and allows defrag on files open in read-only mode. */ if (!capable(CAP_SYS_ADMIN) && - inode_permission(inode, MAY_WRITE)) { + inode_permission(&init_user_ns, inode, MAY_WRITE)) { ret = -EPERM; goto out; } @@ -4460,7 +4480,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, int ret = 0; int received_uuid_changed; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; ret = mnt_want_write_file(file); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index aa9cd11f4b78..9084a950dc09 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -467,7 +467,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, destlen = min_t(unsigned long, destlen, PAGE_SIZE); bytes = min_t(unsigned long, destlen, out_len - start_byte); - kaddr = kmap_atomic(dest_page); + kaddr = kmap_local_page(dest_page); memcpy(kaddr, workspace->buf + start_byte, bytes); /* @@ -477,7 +477,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, */ if (bytes < destlen) memset(kaddr+bytes, 0, destlen-bytes); - kunmap_atomic(kaddr); + kunmap_local(kaddr); out: return ret; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 808370ada888..14ff388fd3bd 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3841,8 +3841,8 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, return num_bytes; } -static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type, bool enforce) +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -3873,14 +3873,14 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, { int ret; - ret = qgroup_reserve_meta(root, num_bytes, type, enforce); + ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); if (ret <= 0 && ret != -EDQUOT) return ret; ret = try_flush_qgroup(root); if (ret < 0) return ret; - return qgroup_reserve_meta(root, num_bytes, type, enforce); + return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); } void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 50dea9a2d8fb..7283e4f549af 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -361,6 +361,8 @@ int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len); int btrfs_qgroup_free_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len); +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce); int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce); /* Reserve metadata space for pertrans and prealloc type */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 8ec34ecb6d68..8c31357f08ed 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -249,8 +249,6 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) static void cache_rbio_pages(struct btrfs_raid_bio *rbio) { int i; - char *s; - char *d; int ret; ret = alloc_rbio_pages(rbio); @@ -261,13 +259,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) if (!rbio->bio_pages[i]) continue; - s = kmap(rbio->bio_pages[i]); - d = kmap(rbio->stripe_pages[i]); - - copy_page(d, s); - - kunmap(rbio->bio_pages[i]); - kunmap(rbio->stripe_pages[i]); + copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]); SetPageUptodate(rbio->stripe_pages[i]); } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -2359,16 +2351,21 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, SetPageUptodate(p_page); if (has_qstripe) { + /* RAID6, allocate and map temp space for the Q stripe */ q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!q_page) { __free_page(p_page); goto cleanup; } SetPageUptodate(q_page); + pointers[rbio->real_stripes - 1] = kmap(q_page); } atomic_set(&rbio->error, 0); + /* Map the parity stripe just once */ + pointers[nr_data] = kmap(p_page); + for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { struct page *p; void *parity; @@ -2378,16 +2375,8 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, pointers[stripe] = kmap(p); } - /* then add the parity stripe */ - pointers[stripe++] = kmap(p_page); - if (has_qstripe) { - /* - * raid6, add the qstripe and call the - * library function to fill in our p/q - */ - pointers[stripe++] = kmap(q_page); - + /* RAID6, call the library function to fill in our P/Q */ raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, pointers); } else { @@ -2408,12 +2397,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, for (stripe = 0; stripe < nr_data; stripe++) kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); - kunmap(p_page); } + kunmap(p_page); __free_page(p_page); - if (q_page) + if (q_page) { + kunmap(q_page); __free_page(q_page); + } writeback: /* diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 2b490becbe67..8e026de74c44 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -218,11 +218,11 @@ static void __print_stack_trace(struct btrfs_fs_info *fs_info, stack_trace_print(ra->trace, ra->trace_len, 2); } #else -static void inline __save_stack_trace(struct ref_action *ra) +static inline void __save_stack_trace(struct ref_action *ra) { } -static void inline __print_stack_trace(struct btrfs_fs_info *fs_info, +static inline void __print_stack_trace(struct btrfs_fs_info *fs_info, struct ref_action *ra) { btrfs_err(fs_info, " ref-verify: no stacktrace support"); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index b24396cf2f99..762881b777b3 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -106,12 +106,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); if (comp_type == BTRFS_COMPRESS_NONE) { - char *map; - - map = kmap(page); - memcpy(map, data_start, datal); + memcpy_to_page(page, 0, data_start, datal); flush_dcache_page(page); - kunmap(page); } else { ret = btrfs_decompress(comp_type, data_start, page, 0, inline_size, datal); @@ -553,6 +549,24 @@ process_slot: */ btrfs_release_path(path); + /* + * When using NO_HOLES and we are cloning a range that covers + * only a hole (no extents) into a range beyond the current + * i_size, punching a hole in the target range will not create + * an extent map defining a hole, because the range starts at or + * beyond current i_size. If the file previously had an i_size + * greater than the new i_size set by this clone operation, we + * need to make sure the next fsync is a full fsync, so that it + * detects and logs a hole covering a range from the current + * i_size to the new i_size. If the clone range covers extents, + * besides a hole, then we know the full sync flag was already + * set by previous calls to btrfs_replace_file_extents() that + * replaced file extent items. + */ + if (last_dest_end >= i_size_read(inode)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + ret = btrfs_replace_file_extents(inode, path, last_dest_end, destoff + len - 1, NULL, &trans); if (ret) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 582df11d298a..c2900ebf767a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3767,6 +3767,13 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * commit_transactions. */ ro_set = 0; + } else if (ret == -ETXTBSY) { + btrfs_warn(fs_info, + "skipping scrub of block group %llu due to active swapfile", + cache->start); + scrub_pause_off(fs_info); + ret = 0; + goto skip_unfreeze; } else { btrfs_warn(fs_info, "failed setting block group ro: %d", ret); @@ -3862,7 +3869,7 @@ done: } else { spin_unlock(&cache->lock); } - +skip_unfreeze: btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); if (ret) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f87878274e9f..8f323859156b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4932,7 +4932,6 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode; struct page *page; - char *addr; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); @@ -4985,10 +4984,8 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) } } - addr = kmap(page); - memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset, - cur_len); - kunmap(page); + memcpy_from_page(sctx->send_buf + sctx->send_size, page, + pg_offset, cur_len); unlock_page(page); put_page(page); index++; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f8435641b912..f7a4ad86adee 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1918,8 +1918,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); - if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) != - btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) != + (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && (!sb_rdonly(sb) || (*flags & SB_RDONLY))) { btrfs_warn(fs_info, "remount supports changing free space tree only from ro to rw"); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 6bd97bd4cb37..3a4099a2bf05 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -62,7 +62,7 @@ struct inode *btrfs_new_test_inode(void) BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - inode_init_owner(inode, NULL, S_IFREG); + inode_init_owner(&init_user_ns, inode, NULL, S_IFREG); return inode; } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 582061c7b547..f4ade821307d 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1453,22 +1453,14 @@ static int check_extent_data_ref(struct extent_buffer *leaf, return -EUCLEAN; } for (; ptr < end; ptr += sizeof(*dref)) { - u64 root_objectid; - u64 owner; u64 offset; - u64 hash; + /* + * We cannot check the extent_data_ref hash due to possible + * overflow from the leaf due to hash collisions. + */ dref = (struct btrfs_extent_data_ref *)ptr; - root_objectid = btrfs_extent_data_ref_root(leaf, dref); - owner = btrfs_extent_data_ref_objectid(leaf, dref); offset = btrfs_extent_data_ref_offset(leaf, dref); - hash = hash_extent_data_ref(root_objectid, owner, offset); - if (unlikely(hash != key->offset)) { - extent_err(leaf, slot, - "invalid extent data ref hash, item has 0x%016llx key has 0x%016llx", - hash, key->offset); - return -EUCLEAN; - } if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) { extent_err(leaf, slot, "invalid extent data backref offset, have %llu expect aligned to %u", diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d90695c1ab6c..2f1acc9aea9e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3174,16 +3174,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, root_log_ctx.log_transid = log_root_tree->log_transid; if (btrfs_is_zoned(fs_info)) { - mutex_lock(&fs_info->tree_root->log_mutex); if (!log_root_tree->node) { ret = btrfs_alloc_log_tree_node(trans, log_root_tree); if (ret) { - mutex_unlock(&fs_info->tree_log_mutex); mutex_unlock(&log_root_tree->log_mutex); goto out; } } - mutex_unlock(&fs_info->tree_root->log_mutex); } /* diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index af6246f36a9e..8a4514283a4b 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -229,11 +229,33 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; + const bool start_trans = (current->journal_info == NULL); int ret; - trans = btrfs_start_transaction(root, 2); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (start_trans) { + /* + * 1 unit for inserting/updating/deleting the xattr + * 1 unit for the inode item update + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + } else { + /* + * This can happen when smack is enabled and a directory is being + * created. It happens through d_instantiate_new(), which calls + * smack_d_instantiate(), which in turn calls __vfs_setxattr() to + * set the transmute xattr (XATTR_NAME_SMACKTRANSMUTE) on the + * inode. We have already reserved space for the xattr and inode + * update at btrfs_mkdir(), so just use the transaction handle. + * We don't join or start a transaction, as that will reset the + * block_rsv of the handle and trigger a warning for the start + * case. + */ + ASSERT(strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) == 0); + trans = current->journal_info; + } ret = btrfs_setxattr(trans, inode, name, value, size, flags); if (ret) @@ -244,7 +266,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); BUG_ON(ret); out: - btrfs_end_transaction(trans); + if (start_trans) + btrfs_end_transaction(trans); return ret; } @@ -362,6 +385,7 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -371,6 +395,7 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 05615a1099db..d524acf7b3e5 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -432,9 +432,8 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, PAGE_SIZE - (buf_offset % PAGE_SIZE)); bytes = min(bytes, bytes_left); - kaddr = kmap_atomic(dest_page); - memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); - kunmap_atomic(kaddr); + memcpy_to_page(dest_page, pg_offset, + workspace->buf + buf_offset, bytes); pg_offset += bytes; bytes_left -= bytes; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index d0eb0c8d6269..1f972b75a9ab 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -269,7 +269,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) sector_t sector = 0; struct blk_zone *zones = NULL; unsigned int i, nreported = 0, nr_zones; - unsigned int zone_sectors; + sector_t zone_sectors; char *model, *emulated; int ret; @@ -658,7 +658,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, u64 *bytenr_ret) { struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES]; - unsigned int zone_sectors; + sector_t zone_sectors; u32 sb_zone; int ret; u8 zone_sectors_shift; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 9a4871636c6c..8e9626d63976 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -688,10 +688,8 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, bytes = min_t(unsigned long, destlen - pg_offset, workspace->out_buf.size - buf_offset); - kaddr = kmap_atomic(dest_page); - memcpy(kaddr + pg_offset, workspace->out_buf.dst + buf_offset, - bytes); - kunmap_atomic(kaddr); + memcpy_to_page(dest_page, pg_offset, + workspace->out_buf.dst + buf_offset, bytes); pg_offset += bytes; } diff --git a/fs/buffer.c b/fs/buffer.c index 32647d2011df..0cb7ffd4977c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -847,7 +847,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, if (retry) gfp |= __GFP_NOFAIL; - memcg = get_mem_cgroup_from_page(page); + /* The page lock pins the memcg */ + memcg = page_memcg(page); old_memcg = set_active_memcg(memcg); head = NULL; @@ -868,7 +869,6 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, } out: set_active_memcg(old_memcg); - mem_cgroup_put(memcg); return head; /* * In case anything failed, we just free everything we got. @@ -2083,7 +2083,8 @@ static int __block_commit_write(struct inode *inode, struct page *page, set_buffer_uptodate(bh); mark_buffer_dirty(bh); } - clear_buffer_new(bh); + if (buffer_new(bh)) + clear_buffer_new(bh); block_start = block_end; bh = bh->b_this_page; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 4cea5fbf695e..5efa6a3702c0 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -470,14 +470,14 @@ static int cachefiles_attr_changed(struct fscache_object *_object) _debug("discard tail %llx", oi_size); newattrs.ia_valid = ATTR_SIZE; newattrs.ia_size = oi_size & PAGE_MASK; - ret = notify_change(object->backer, &newattrs, NULL); + ret = notify_change(&init_user_ns, object->backer, &newattrs, NULL); if (ret < 0) goto truncate_failed; } newattrs.ia_valid = ATTR_SIZE; newattrs.ia_size = ni_size; - ret = notify_change(object->backer, &newattrs, NULL); + ret = notify_change(&init_user_ns, object->backer, &newattrs, NULL); truncate_failed: inode_unlock(d_inode(object->backer)); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index ecc8ecbbfa5a..7bf0732ae25c 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -311,7 +311,8 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, cachefiles_io_error(cache, "Unlink security error"); } else { trace_cachefiles_unlink(object, rep, why); - ret = vfs_unlink(d_inode(dir), rep, NULL); + ret = vfs_unlink(&init_user_ns, d_inode(dir), rep, + NULL); if (preemptive) cachefiles_mark_object_buried(cache, rep, why); @@ -412,9 +413,16 @@ try_again: if (ret < 0) { cachefiles_io_error(cache, "Rename security error %d", ret); } else { + struct renamedata rd = { + .old_mnt_userns = &init_user_ns, + .old_dir = d_inode(dir), + .old_dentry = rep, + .new_mnt_userns = &init_user_ns, + .new_dir = d_inode(cache->graveyard), + .new_dentry = grave, + }; trace_cachefiles_rename(object, rep, grave, why); - ret = vfs_rename(d_inode(dir), rep, - d_inode(cache->graveyard), grave, NULL, 0); + ret = vfs_rename(&rd); if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); @@ -561,7 +569,7 @@ lookup_again: if (ret < 0) goto create_error; start = jiffies; - ret = vfs_mkdir(d_inode(dir), next, 0); + ret = vfs_mkdir(&init_user_ns, d_inode(dir), next, 0); cachefiles_hist(cachefiles_mkdir_histogram, start); if (!key) trace_cachefiles_mkdir(object, next, ret); @@ -597,7 +605,8 @@ lookup_again: if (ret < 0) goto create_error; start = jiffies; - ret = vfs_create(d_inode(dir), next, S_IFREG, true); + ret = vfs_create(&init_user_ns, d_inode(dir), next, + S_IFREG, true); cachefiles_hist(cachefiles_create_histogram, start); trace_cachefiles_create(object, next, ret); if (ret < 0) @@ -791,7 +800,7 @@ retry: ret = security_path_mkdir(&path, subdir, 0700); if (ret < 0) goto mkdir_error; - ret = vfs_mkdir(d_inode(dir), subdir, 0700); + ret = vfs_mkdir(&init_user_ns, d_inode(dir), subdir, 0700); if (ret < 0) goto mkdir_error; diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 72e42438f3d7..a591b5e09637 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -39,8 +39,8 @@ int cachefiles_check_object_type(struct cachefiles_object *object) _enter("%p{%s}", object, type); /* attempt to install a type label directly */ - ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2, - XATTR_CREATE); + ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, type, + 2, XATTR_CREATE); if (ret == 0) { _debug("SET"); /* we succeeded */ goto error; @@ -54,7 +54,8 @@ int cachefiles_check_object_type(struct cachefiles_object *object) } /* read the current type label */ - ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3); + ret = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, xtype, + 3); if (ret < 0) { if (ret == -ERANGE) goto bad_type_length; @@ -110,9 +111,8 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object, _debug("SET #%u", auxdata->len); clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags); - ret = vfs_setxattr(dentry, cachefiles_xattr_cache, - &auxdata->type, auxdata->len, - XATTR_CREATE); + ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, XATTR_CREATE); if (ret < 0 && ret != -ENOMEM) cachefiles_io_error_obj( object, @@ -140,9 +140,8 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object, _debug("SET #%u", auxdata->len); clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags); - ret = vfs_setxattr(dentry, cachefiles_xattr_cache, - &auxdata->type, auxdata->len, - XATTR_REPLACE); + ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, XATTR_REPLACE); if (ret < 0 && ret != -ENOMEM) cachefiles_io_error_obj( object, @@ -171,7 +170,7 @@ int cachefiles_check_auxdata(struct cachefiles_object *object) if (!auxbuf) return -ENOMEM; - xlen = vfs_getxattr(dentry, cachefiles_xattr_cache, + xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, &auxbuf->type, 512 + 1); ret = -ESTALE; if (xlen < 1 || @@ -213,7 +212,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, } /* read the current type label */ - ret = vfs_getxattr(dentry, cachefiles_xattr_cache, + ret = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, &auxbuf->type, 512 + 1); if (ret < 0) { if (ret == -ENODATA) @@ -270,9 +269,9 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, } /* update the current label */ - ret = vfs_setxattr(dentry, cachefiles_xattr_cache, - &auxdata->type, auxdata->len, - XATTR_REPLACE); + ret = vfs_setxattr(&init_user_ns, dentry, + cachefiles_xattr_cache, &auxdata->type, + auxdata->len, XATTR_REPLACE); if (ret < 0) { cachefiles_io_error_obj(object, "Can't update xattr on %lu" @@ -309,7 +308,7 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, { int ret; - ret = vfs_removexattr(dentry, cachefiles_xattr_cache); + ret = vfs_removexattr(&init_user_ns, dentry, cachefiles_xattr_cache); if (ret < 0) { if (ret == -ENOENT || ret == -ENODATA) ret = 0; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index e0465741c591..529af59d9fd3 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -82,7 +82,8 @@ retry: return acl; } -int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int ceph_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int ret = 0, size = 0; const char *name = NULL; @@ -100,7 +101,8 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - ret = posix_acl_update_mode(inode, &new_mode, &acl); + ret = posix_acl_update_mode(&init_user_ns, inode, + &new_mode, &acl); if (ret) goto out; } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 858ee7362ff5..83d9358854fb 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -823,8 +823,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) return PTR_ERR(result); } -static int ceph_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; @@ -878,14 +878,14 @@ out: return err; } -static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int ceph_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - return ceph_mknod(dir, dentry, mode, 0); + return ceph_mknod(mnt_userns, dir, dentry, mode, 0); } -static int ceph_symlink(struct inode *dir, struct dentry *dentry, - const char *dest) +static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *dest) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; @@ -937,7 +937,8 @@ out: return err; } -static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; @@ -1183,9 +1184,9 @@ out: return err; } -static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb); struct ceph_mds_request *req; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5d20a620e96c..156f849f5385 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2201,7 +2201,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) /* * setattr */ -int ceph_setattr(struct dentry *dentry, struct iattr *attr) +int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); @@ -2210,7 +2211,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err != 0) return err; @@ -2225,7 +2226,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) err = __ceph_setattr(inode, attr); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) - err = posix_acl_chmod(inode, attr->ia_mode); + err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode); return err; } @@ -2284,7 +2285,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, * Check inode permissions. We verify we have a valid value for * the AUTH cap, then call the generic handler. */ -int ceph_permission(struct inode *inode, int mask) +int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { int err; @@ -2294,7 +2296,7 @@ int ceph_permission(struct inode *inode, int mask) err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false); if (!err) - err = generic_permission(inode, mask); + err = generic_permission(&init_user_ns, inode, mask); return err; } @@ -2331,8 +2333,8 @@ static int statx_to_caps(u32 want, umode_t mode) * Get all the attributes. If we have sufficient caps for the requested attrs, * then we can avoid talking to the MDS at all. */ -int ceph_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); struct ceph_inode_info *ci = ceph_inode(inode); @@ -2348,7 +2350,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat, return err; } - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->ino = ceph_present_inode(inode); /* diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 13b02887b085..c48bb30c8d70 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1000,10 +1000,13 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) { return __ceph_do_getattr(inode, NULL, mask, force); } -extern int ceph_permission(struct inode *inode, int mask); +extern int ceph_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); extern int __ceph_setattr(struct inode *inode, struct iattr *attr); -extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); -extern int ceph_getattr(const struct path *path, struct kstat *stat, +extern int ceph_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr); +extern int ceph_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); /* xattr.c */ @@ -1064,7 +1067,8 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); #ifdef CONFIG_CEPH_FS_POSIX_ACL struct posix_acl *ceph_get_acl(struct inode *, int); -int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ceph_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, struct posix_acl *acl, int type); int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); void ceph_init_inode_acls(struct inode *inode, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 24997982de01..02f59bcb4f27 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1238,6 +1238,7 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler, } static int ceph_set_xattr_handler(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index aa697ccfa9dc..3aedc484e440 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -133,11 +133,12 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan) { struct TCP_Server_Info *server = chan->server; - seq_printf(m, "\t\tChannel %d Number of credits: %d Dialect 0x%x " - "TCP status: %d Instance: %d Local Users To Server: %d " - "SecMode: 0x%x Req On Wire: %d In Send: %d " - "In MaxReq Wait: %d\n", - i+1, + seq_printf(m, "\n\n\t\tChannel: %d ConnectionId: 0x%llx" + "\n\t\tNumber of credits: %d Dialect 0x%x" + "\n\t\tTCP status: %d Instance: %d" + "\n\t\tLocal Users To Server: %d SecMode: 0x%x Req On Wire: %d" + "\n\t\tIn Send: %d In MaxReq Wait: %d", + i+1, server->conn_id, server->credits, server->dialect, server->tcpStatus, @@ -227,7 +228,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; - int i, j; + int c, i, j; seq_puts(m, "Display Internal CIFS Data Structures for Debugging\n" @@ -275,14 +276,25 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_putc(m, '\n'); seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize); seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); - seq_printf(m, "Servers:"); - i = 0; + seq_printf(m, "\nServers: "); + + c = 0; spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp1, &cifs_tcp_ses_list) { server = list_entry(tmp1, struct TCP_Server_Info, tcp_ses_list); + /* channel info will be printed as a part of sessions below */ + if (server->is_channel) + continue; + + c++; + seq_printf(m, "\n%d) ConnectionId: 0x%llx ", + c, server->conn_id); + + if (server->hostname) + seq_printf(m, "Hostname: %s ", server->hostname); #ifdef CONFIG_CIFS_SMB_DIRECT if (!server->rdma) goto skip_rdma; @@ -362,46 +374,48 @@ skip_rdma: if (server->posix_ext_supported) seq_printf(m, " posix"); - i++; + if (server->rdma) + seq_printf(m, "\nRDMA "); + seq_printf(m, "\nTCP status: %d Instance: %d" + "\nLocal Users To Server: %d SecMode: 0x%x Req On Wire: %d", + server->tcpStatus, + server->reconnect_instance, + server->srv_count, + server->sec_mode, in_flight(server)); + + seq_printf(m, "\nIn Send: %d In MaxReq Wait: %d", + atomic_read(&server->in_send), + atomic_read(&server->num_waiters)); + + seq_printf(m, "\n\n\tSessions: "); + i = 0; list_for_each(tmp2, &server->smb_ses_list) { ses = list_entry(tmp2, struct cifs_ses, smb_ses_list); + i++; if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || (ses->serverNOS == NULL)) { - seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d ", - i, ses->serverName, ses->ses_count, + seq_printf(m, "\n\t%d) Address: %s Uses: %d Capability: 0x%x\tSession Status: %d ", + i, ses->ip_addr, ses->ses_count, ses->capabilities, ses->status); if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) - seq_printf(m, "Guest\t"); + seq_printf(m, "Guest "); else if (ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) - seq_printf(m, "Anonymous\t"); + seq_printf(m, "Anonymous "); } else { seq_printf(m, - "\n%d) Name: %s Domain: %s Uses: %d OS:" - " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB" - " session status: %d ", - i, ses->serverName, ses->serverDomain, + "\n\t%d) Name: %s Domain: %s Uses: %d OS: %s " + "\n\tNOS: %s\tCapability: 0x%x" + "\n\tSMB session status: %d ", + i, ses->ip_addr, ses->serverDomain, ses->ses_count, ses->serverOS, ses->serverNOS, ses->capabilities, ses->status); } - seq_printf(m,"Security type: %s\n", + seq_printf(m, "\n\tSecurity type: %s ", get_security_type_str(server->ops->select_sectype(server, ses->sectype))); - if (server->rdma) - seq_printf(m, "RDMA\n\t"); - seq_printf(m, "TCP status: %d Instance: %d\n\tLocal Users To " - "Server: %d SecMode: 0x%x Req On Wire: %d", - server->tcpStatus, - server->reconnect_instance, - server->srv_count, - server->sec_mode, in_flight(server)); - - seq_printf(m, " In Send: %d In MaxReq Wait: %d", - atomic_read(&server->in_send), - atomic_read(&server->num_waiters)); - /* dump session id helpful for use with network trace */ seq_printf(m, " SessionId: 0x%llx", ses->Suid); if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) @@ -414,13 +428,13 @@ skip_rdma: from_kuid(&init_user_ns, ses->cred_uid)); if (ses->chan_count > 1) { - seq_printf(m, "\n\n\tExtra Channels: %zu\n", + seq_printf(m, "\n\n\tExtra Channels: %zu ", ses->chan_count-1); for (j = 1; j < ses->chan_count; j++) cifs_dump_channel(m, j, &ses->chans[j]); } - seq_puts(m, "\n\n\tShares:"); + seq_puts(m, "\n\n\tShares: "); j = 0; seq_printf(m, "\n\t%d) IPC: ", j); @@ -437,38 +451,43 @@ skip_rdma: cifs_debug_tcon(m, tcon); } - seq_puts(m, "\n\tMIDs:\n"); - - spin_lock(&GlobalMid_Lock); - list_for_each(tmp3, &server->pending_mid_q) { - mid_entry = list_entry(tmp3, struct mid_q_entry, - qhead); - seq_printf(m, "\tState: %d com: %d pid:" - " %d cbdata: %p mid %llu\n", - mid_entry->mid_state, - le16_to_cpu(mid_entry->command), - mid_entry->pid, - mid_entry->callback_data, - mid_entry->mid); - } - spin_unlock(&GlobalMid_Lock); - spin_lock(&ses->iface_lock); if (ses->iface_count) - seq_printf(m, "\n\tServer interfaces: %zu\n", + seq_printf(m, "\n\n\tServer interfaces: %zu", ses->iface_count); for (j = 0; j < ses->iface_count; j++) { struct cifs_server_iface *iface; iface = &ses->iface_list[j]; - seq_printf(m, "\t%d)", j); + seq_printf(m, "\n\t%d)", j+1); cifs_dump_iface(m, iface); if (is_ses_using_iface(ses, iface)) seq_puts(m, "\t\t[CONNECTED]\n"); } spin_unlock(&ses->iface_lock); } + if (i == 0) + seq_printf(m, "\n\t\t[NONE]"); + + seq_puts(m, "\n\n\tMIDs: "); + spin_lock(&GlobalMid_Lock); + list_for_each(tmp3, &server->pending_mid_q) { + mid_entry = list_entry(tmp3, struct mid_q_entry, + qhead); + seq_printf(m, "\n\tState: %d com: %d pid:" + " %d cbdata: %p mid %llu\n", + mid_entry->mid_state, + le16_to_cpu(mid_entry->command), + mid_entry->pid, + mid_entry->callback_data, + mid_entry->mid); + } + spin_unlock(&GlobalMid_Lock); + seq_printf(m, "\n--\n"); } + if (c == 0) + seq_printf(m, "\n\t[NONE]"); + spin_unlock(&cifs_tcp_ses_lock); seq_putc(m, '\n'); diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index d35f599aa00e..f2d730fffccb 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -272,7 +272,7 @@ static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon) if (IS_ERR(share_name)) { int ret; - ret = PTR_ERR(net_name); + ret = PTR_ERR(share_name); cifs_dbg(VFS, "%s: failed to extract share name from target '%s': %d\n", __func__, tcon->treeName, ret); kfree(net_name); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 562913e2b3f2..9d29eb9660c2 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -267,10 +267,11 @@ is_well_known_sid(const struct cifs_sid *psid, uint32_t *puid, bool is_group) return true; /* well known sid found, uid returned */ } -static void +static __u16 cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) { int i; + __u16 size = 1 + 1 + 6; dst->revision = src->revision; dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES); @@ -278,6 +279,9 @@ cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) dst->authority[i] = src->authority[i]; for (i = 0; i < dst->num_subauth; ++i) dst->sub_auth[i] = src->sub_auth[i]; + size += (dst->num_subauth * 4); + + return size; } static int @@ -521,8 +525,11 @@ exit_cifs_idmap(void) } /* copy ntsd, owner sid, and group sid from a security descriptor to another */ -static void copy_sec_desc(const struct cifs_ntsd *pntsd, - struct cifs_ntsd *pnntsd, __u32 sidsoffset) +static __u32 copy_sec_desc(const struct cifs_ntsd *pntsd, + struct cifs_ntsd *pnntsd, + __u32 sidsoffset, + struct cifs_sid *pownersid, + struct cifs_sid *pgrpsid) { struct cifs_sid *owner_sid_ptr, *group_sid_ptr; struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; @@ -536,19 +543,25 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd, pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct cifs_sid)); /* copy owner sid */ - owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + if (pownersid) + owner_sid_ptr = pownersid; + else + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr); /* copy group sid */ - group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + if (pgrpsid) + group_sid_ptr = pgrpsid; + else + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + sizeof(struct cifs_sid)); cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr); - return; + return sidsoffset + (2 * sizeof(struct cifs_sid)); } @@ -663,6 +676,25 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use, return; } +static __u16 cifs_copy_ace(struct cifs_ace *dst, struct cifs_ace *src, struct cifs_sid *psid) +{ + __u16 size = 1 + 1 + 2 + 4; + + dst->type = src->type; + dst->flags = src->flags; + dst->access_req = src->access_req; + + /* Check if there's a replacement sid specified */ + if (psid) + size += cifs_copy_sid(&dst->sid, psid); + else + size += cifs_copy_sid(&dst->sid, &src->sid); + + dst->size = cpu_to_le16(size); + + return size; +} + static __u16 fill_ace_for_sid(struct cifs_ace *pntace, const struct cifs_sid *psid, __u64 nmode, umode_t bits, __u8 access_type, @@ -907,29 +939,30 @@ unsigned int setup_special_user_owner_ACE(struct cifs_ace *pntace) return ace_size; } -static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, - struct cifs_sid *pgrpsid, __u64 *pnmode, bool modefromsid) +static void populate_new_aces(char *nacl_base, + struct cifs_sid *pownersid, + struct cifs_sid *pgrpsid, + __u64 *pnmode, u32 *pnum_aces, u16 *pnsize, + bool modefromsid) { - u16 size = 0; - u32 num_aces = 0; - struct cifs_acl *pnndacl; __u64 nmode; + u32 num_aces = 0; + u16 nsize = 0; __u64 user_mode; __u64 group_mode; __u64 other_mode; __u64 deny_user_mode = 0; __u64 deny_group_mode = 0; bool sticky_set = false; - - pnndacl = (struct cifs_acl *)((char *)pndacl + sizeof(struct cifs_acl)); + struct cifs_ace *pnntace = NULL; nmode = *pnmode; + num_aces = *pnum_aces; + nsize = *pnsize; if (modefromsid) { - struct cifs_ace *pntace = - (struct cifs_ace *)((char *)pnndacl + size); - - size += setup_special_mode_ACE(pntace, nmode); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += setup_special_mode_ACE(pnntace, nmode); num_aces++; goto set_size; } @@ -966,40 +999,170 @@ static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, sticky_set = true; if (deny_user_mode) { - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - pownersid, deny_user_mode, 0700, ACCESS_DENIED, false); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pownersid, deny_user_mode, + 0700, ACCESS_DENIED, false); num_aces++; } + /* Group DENY ACE does not conflict with owner ALLOW ACE. Keep in preferred order*/ if (deny_group_mode && !(deny_group_mode & (user_mode >> 3))) { - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - pgrpsid, deny_group_mode, 0070, ACCESS_DENIED, false); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pgrpsid, deny_group_mode, + 0070, ACCESS_DENIED, false); num_aces++; } - size += fill_ace_for_sid((struct cifs_ace *) ((char *)pnndacl + size), - pownersid, user_mode, 0700, ACCESS_ALLOWED, true); + + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pownersid, user_mode, + 0700, ACCESS_ALLOWED, true); num_aces++; + /* Group DENY ACE conflicts with owner ALLOW ACE. So keep it after. */ if (deny_group_mode && (deny_group_mode & (user_mode >> 3))) { - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - pgrpsid, deny_group_mode, 0070, ACCESS_DENIED, false); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pgrpsid, deny_group_mode, + 0070, ACCESS_DENIED, false); num_aces++; } - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - pgrpsid, group_mode, 0070, ACCESS_ALLOWED, !sticky_set); + + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pgrpsid, group_mode, + 0070, ACCESS_ALLOWED, !sticky_set); num_aces++; - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - &sid_everyone, other_mode, 0007, ACCESS_ALLOWED, !sticky_set); + + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, &sid_everyone, other_mode, + 0007, ACCESS_ALLOWED, !sticky_set); num_aces++; set_size: + *pnum_aces = num_aces; + *pnsize = nsize; +} + +static __u16 replace_sids_and_copy_aces(struct cifs_acl *pdacl, struct cifs_acl *pndacl, + struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, + struct cifs_sid *pnownersid, struct cifs_sid *pngrpsid) +{ + int i; + u16 size = 0; + struct cifs_ace *pntace = NULL; + char *acl_base = NULL; + u32 src_num_aces = 0; + u16 nsize = 0; + struct cifs_ace *pnntace = NULL; + char *nacl_base = NULL; + u16 ace_size = 0; + + acl_base = (char *)pdacl; + size = sizeof(struct cifs_acl); + src_num_aces = le32_to_cpu(pdacl->num_aces); + + nacl_base = (char *)pndacl; + nsize = sizeof(struct cifs_acl); + + /* Go through all the ACEs */ + for (i = 0; i < src_num_aces; ++i) { + pntace = (struct cifs_ace *) (acl_base + size); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + + if (pnownersid && compare_sids(&pntace->sid, pownersid) == 0) + ace_size = cifs_copy_ace(pnntace, pntace, pnownersid); + else if (pngrpsid && compare_sids(&pntace->sid, pgrpsid) == 0) + ace_size = cifs_copy_ace(pnntace, pntace, pngrpsid); + else + ace_size = cifs_copy_ace(pnntace, pntace, NULL); + + size += le16_to_cpu(pntace->size); + nsize += ace_size; + } + + return nsize; +} + +static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, + struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, + __u64 *pnmode, bool mode_from_sid) +{ + int i; + u16 size = 0; + struct cifs_ace *pntace = NULL; + char *acl_base = NULL; + u32 src_num_aces = 0; + u16 nsize = 0; + struct cifs_ace *pnntace = NULL; + char *nacl_base = NULL; + u32 num_aces = 0; + __u64 nmode; + bool new_aces_set = false; + + /* Assuming that pndacl and pnmode are never NULL */ + nmode = *pnmode; + nacl_base = (char *)pndacl; + nsize = sizeof(struct cifs_acl); + + /* If pdacl is NULL, we don't have a src. Simply populate new ACL. */ + if (!pdacl) { + populate_new_aces(nacl_base, + pownersid, pgrpsid, + pnmode, &num_aces, &nsize, + mode_from_sid); + goto finalize_dacl; + } + + acl_base = (char *)pdacl; + size = sizeof(struct cifs_acl); + src_num_aces = le32_to_cpu(pdacl->num_aces); + + /* Retain old ACEs which we can retain */ + for (i = 0; i < src_num_aces; ++i) { + pntace = (struct cifs_ace *) (acl_base + size); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + + if (!new_aces_set && (pntace->flags & INHERITED_ACE)) { + /* Place the new ACEs in between existing explicit and inherited */ + populate_new_aces(nacl_base, + pownersid, pgrpsid, + pnmode, &num_aces, &nsize, + mode_from_sid); + + new_aces_set = true; + } + + /* If it's any one of the ACE we're replacing, skip! */ + if ((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) || + (compare_sids(&pntace->sid, pownersid) == 0) || + (compare_sids(&pntace->sid, pgrpsid) == 0) || + (compare_sids(&pntace->sid, &sid_everyone) == 0) || + (compare_sids(&pntace->sid, &sid_authusers) == 0)) { + goto next_ace; + } + + nsize += cifs_copy_ace(pnntace, pntace, NULL); + num_aces++; + +next_ace: + size += le16_to_cpu(pntace->size); + } + + /* If inherited ACEs are not present, place the new ones at the tail */ + if (!new_aces_set) { + populate_new_aces(nacl_base, + pownersid, pgrpsid, + pnmode, &num_aces, &nsize, + mode_from_sid); + + new_aces_set = true; + } + +finalize_dacl: pndacl->num_aces = cpu_to_le32(num_aces); - pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl)); + pndacl->size = cpu_to_le16(nsize); return 0; } - static int parse_sid(struct cifs_sid *psid, char *end_of_acl) { /* BB need to add parm so we can store the SID BB */ @@ -1094,7 +1257,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, /* Convert permission bits from mode to equivalent CIFS ACL */ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, - __u32 secdesclen, __u64 *pnmode, kuid_t uid, kgid_t gid, + __u32 secdesclen, __u32 *pnsecdesclen, __u64 *pnmode, kuid_t uid, kgid_t gid, bool mode_from_sid, bool id_from_sid, int *aclflag) { int rc = 0; @@ -1102,39 +1265,59 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, __u32 ndacloffset; __u32 sidsoffset; struct cifs_sid *owner_sid_ptr, *group_sid_ptr; - struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; + struct cifs_sid *nowner_sid_ptr = NULL, *ngroup_sid_ptr = NULL; struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ + char *end_of_acl = ((char *)pntsd) + secdesclen; + u16 size = 0; - if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ - owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + - le32_to_cpu(pntsd->osidoffset)); - group_sid_ptr = (struct cifs_sid *)((char *)pntsd + - le32_to_cpu(pntsd->gsidoffset)); - dacloffset = le32_to_cpu(pntsd->dacloffset); + dacloffset = le32_to_cpu(pntsd->dacloffset); + if (dacloffset) { dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + if (end_of_acl < (char *)dacl_ptr + le16_to_cpu(dacl_ptr->size)) { + cifs_dbg(VFS, "Server returned illegal ACL size\n"); + return -EINVAL; + } + } + + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->osidoffset)); + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->gsidoffset)); + + if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ ndacloffset = sizeof(struct cifs_ntsd); ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); - ndacl_ptr->revision = dacl_ptr->revision; - ndacl_ptr->size = 0; - ndacl_ptr->num_aces = 0; + ndacl_ptr->revision = + dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION); - rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, + ndacl_ptr->size = cpu_to_le16(0); + ndacl_ptr->num_aces = cpu_to_le32(0); + + rc = set_chmod_dacl(dacl_ptr, ndacl_ptr, owner_sid_ptr, group_sid_ptr, pnmode, mode_from_sid); + sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); - /* copy sec desc control portion & owner and group sids */ - copy_sec_desc(pntsd, pnntsd, sidsoffset); - *aclflag = CIFS_ACL_DACL; + /* copy the non-dacl portion of secdesc */ + *pnsecdesclen = copy_sec_desc(pntsd, pnntsd, sidsoffset, + NULL, NULL); + + *aclflag |= CIFS_ACL_DACL; } else { - memcpy(pnntsd, pntsd, secdesclen); + ndacloffset = sizeof(struct cifs_ntsd); + ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); + ndacl_ptr->revision = + dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION); + ndacl_ptr->num_aces = dacl_ptr->num_aces; + if (uid_valid(uid)) { /* chown */ uid_t id; - owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + - le32_to_cpu(pnntsd->osidoffset)); nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), GFP_KERNEL); - if (!nowner_sid_ptr) - return -ENOMEM; + if (!nowner_sid_ptr) { + rc = -ENOMEM; + goto chown_chgrp_exit; + } id = from_kuid(&init_user_ns, uid); if (id_from_sid) { struct owner_sid *osid = (struct owner_sid *)nowner_sid_ptr; @@ -1145,27 +1328,25 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, osid->SubAuthorities[0] = cpu_to_le32(88); osid->SubAuthorities[1] = cpu_to_le32(1); osid->SubAuthorities[2] = cpu_to_le32(id); + } else { /* lookup sid with upcall */ rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr); if (rc) { cifs_dbg(FYI, "%s: Mapping error %d for owner id %d\n", __func__, rc, id); - kfree(nowner_sid_ptr); - return rc; + goto chown_chgrp_exit; } } - cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr); - kfree(nowner_sid_ptr); - *aclflag = CIFS_ACL_OWNER; + *aclflag |= CIFS_ACL_OWNER; } if (gid_valid(gid)) { /* chgrp */ gid_t id; - group_sid_ptr = (struct cifs_sid *)((char *)pnntsd + - le32_to_cpu(pnntsd->gsidoffset)); ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), GFP_KERNEL); - if (!ngroup_sid_ptr) - return -ENOMEM; + if (!ngroup_sid_ptr) { + rc = -ENOMEM; + goto chown_chgrp_exit; + } id = from_kgid(&init_user_ns, gid); if (id_from_sid) { struct owner_sid *gsid = (struct owner_sid *)ngroup_sid_ptr; @@ -1176,19 +1357,35 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, gsid->SubAuthorities[0] = cpu_to_le32(88); gsid->SubAuthorities[1] = cpu_to_le32(2); gsid->SubAuthorities[2] = cpu_to_le32(id); + } else { /* lookup sid with upcall */ rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr); if (rc) { cifs_dbg(FYI, "%s: Mapping error %d for group id %d\n", __func__, rc, id); - kfree(ngroup_sid_ptr); - return rc; + goto chown_chgrp_exit; } } - cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr); - kfree(ngroup_sid_ptr); - *aclflag = CIFS_ACL_GROUP; + *aclflag |= CIFS_ACL_GROUP; + } + + if (dacloffset) { + /* Replace ACEs for old owner with new one */ + size = replace_sids_and_copy_aces(dacl_ptr, ndacl_ptr, + owner_sid_ptr, group_sid_ptr, + nowner_sid_ptr, ngroup_sid_ptr); + ndacl_ptr->size = cpu_to_le16(size); } + + sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); + /* copy the non-dacl portion of secdesc */ + *pnsecdesclen = copy_sec_desc(pntsd, pnntsd, sidsoffset, + nowner_sid_ptr, ngroup_sid_ptr); + +chown_chgrp_exit: + /* errors could jump here. So make sure we return soon after this */ + kfree(nowner_sid_ptr); + kfree(ngroup_sid_ptr); } return rc; @@ -1384,6 +1581,9 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, int rc = 0; int aclflag = CIFS_ACL_DACL; /* default flag to set */ __u32 secdesclen = 0; + __u32 nsecdesclen = 0; + __u32 dacloffset = 0; + struct cifs_acl *dacl_ptr = NULL; struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -1414,31 +1614,52 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, return rc; } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) + mode_from_sid = true; + else + mode_from_sid = false; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) + id_from_sid = true; + else + id_from_sid = false; + + /* Potentially, five new ACEs can be added to the ACL for U,G,O mapping */ + nsecdesclen = secdesclen; + if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ + if (mode_from_sid) + nsecdesclen += sizeof(struct cifs_ace); + else /* cifsacl */ + nsecdesclen += 5 * sizeof(struct cifs_ace); + } else { /* chown */ + /* When ownership changes, changes new owner sid length could be different */ + nsecdesclen = sizeof(struct cifs_ntsd) + (sizeof(struct cifs_sid) * 2); + dacloffset = le32_to_cpu(pntsd->dacloffset); + if (dacloffset) { + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + if (mode_from_sid) + nsecdesclen += + le32_to_cpu(dacl_ptr->num_aces) * sizeof(struct cifs_ace); + else /* cifsacl */ + nsecdesclen += le16_to_cpu(dacl_ptr->size); + } + } + /* * Add three ACEs for owner, group, everyone getting rid of other ACEs * as chmod disables ACEs and set the security descriptor. Allocate * memory for the smb header, set security descriptor request security * descriptor parameters, and secuirty descriptor itself */ - secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN); - pnntsd = kmalloc(secdesclen, GFP_KERNEL); + nsecdesclen = max_t(u32, nsecdesclen, DEFAULT_SEC_DESC_LEN); + pnntsd = kmalloc(nsecdesclen, GFP_KERNEL); if (!pnntsd) { kfree(pntsd); cifs_put_tlink(tlink); return -ENOMEM; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) - mode_from_sid = true; - else - mode_from_sid = false; - - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) - id_from_sid = true; - else - id_from_sid = false; - - rc = build_sec_desc(pntsd, pnntsd, secdesclen, pnmode, uid, gid, + rc = build_sec_desc(pntsd, pnntsd, secdesclen, &nsecdesclen, pnmode, uid, gid, mode_from_sid, id_from_sid, &aclflag); cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); @@ -1448,7 +1669,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, if (!rc) { /* Set the security descriptor */ - rc = ops->set_acl(pnntsd, secdesclen, inode, path, aclflag); + rc = ops->set_acl(pnntsd, nsecdesclen, inode, path, aclflag); cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } cifs_put_tlink(tlink); diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index ff7fd0862e28..d9e704979d99 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -31,8 +31,8 @@ #define EXEC_BIT 0x1 #define ACL_OWNER_MASK 0700 -#define ACL_GROUP_MASK 0770 -#define ACL_EVERYONE_MASK 0777 +#define ACL_GROUP_MASK 0070 +#define ACL_EVERYONE_MASK 0007 #define UBITSHIFT 6 #define GBITSHIFT 3 diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 51d53e4bdf6b..b8f1ff9a83f3 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -568,15 +568,15 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, return rc; } } else { - /* We use ses->serverName if no domain name available */ - len = strlen(ses->serverName); + /* We use ses->ip_addr if no domain name available */ + len = strlen(ses->ip_addr); server = kmalloc(2 + (len * 2), GFP_KERNEL); if (server == NULL) { rc = -ENOMEM; return rc; } - len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len, + len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, nls_cp); rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index ab883e84e116..d43e935d2df4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -305,7 +305,8 @@ static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len) return -EOPNOTSUPP; } -static int cifs_permission(struct inode *inode, int mask) +static int cifs_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct cifs_sb_info *cifs_sb; @@ -320,7 +321,7 @@ static int cifs_permission(struct inode *inode, int mask) on the client (above and beyond ACL on servers) for servers which do not support setting and viewing mode bits, so allowing client to check permissions is useful */ - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } static struct kmem_cache *cifs_inode_cachep; @@ -637,8 +638,18 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); if (tcon->handle_timeout) seq_printf(s, ",handletimeout=%u", tcon->handle_timeout); - /* convert actimeo and display it in seconds */ - seq_printf(s, ",actimeo=%lu", cifs_sb->ctx->actimeo / HZ); + + /* + * Display file and directory attribute timeout in seconds. + * If file and directory attribute timeout the same then actimeo + * was likely specified on mount + */ + if (cifs_sb->ctx->acdirmax == cifs_sb->ctx->acregmax) + seq_printf(s, ",actimeo=%lu", cifs_sb->ctx->acregmax / HZ); + else { + seq_printf(s, ",acdirmax=%lu", cifs_sb->ctx->acdirmax / HZ); + seq_printf(s, ",acregmax=%lu", cifs_sb->ctx->acregmax / HZ); + } if (tcon->ses->chan_max > 1) seq_printf(s, ",multichannel,max_channels=%zu", @@ -1525,6 +1536,7 @@ init_cifs(void) */ atomic_set(&sesInfoAllocCount, 0); atomic_set(&tconInfoAllocCount, 0); + atomic_set(&tcpSesNextId, 0); atomic_set(&tcpSesAllocCount, 0); atomic_set(&tcpSesReconnectCount, 0); atomic_set(&tconInfoReconnectCount, 0); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 2307bb0f6147..0d7ef150dbb2 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -62,19 +62,22 @@ extern void cifs_sb_deactive(struct super_block *sb); /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); -extern int cifs_create(struct inode *, struct dentry *, umode_t, - bool excl); +extern int cifs_create(struct user_namespace *, struct inode *, + struct dentry *, umode_t, bool excl); extern int cifs_atomic_open(struct inode *, struct dentry *, struct file *, unsigned, umode_t); extern struct dentry *cifs_lookup(struct inode *, struct dentry *, unsigned int); extern int cifs_unlink(struct inode *dir, struct dentry *dentry); extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); -extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t); -extern int cifs_mkdir(struct inode *, struct dentry *, umode_t); +extern int cifs_mknod(struct user_namespace *, struct inode *, struct dentry *, + umode_t, dev_t); +extern int cifs_mkdir(struct user_namespace *, struct inode *, struct dentry *, + umode_t); extern int cifs_rmdir(struct inode *, struct dentry *); -extern int cifs_rename2(struct inode *, struct dentry *, struct inode *, - struct dentry *, unsigned int); +extern int cifs_rename2(struct user_namespace *, struct inode *, + struct dentry *, struct inode *, struct dentry *, + unsigned int); extern int cifs_revalidate_file_attr(struct file *filp); extern int cifs_revalidate_dentry_attr(struct dentry *); extern int cifs_revalidate_file(struct file *filp); @@ -82,8 +85,10 @@ extern int cifs_revalidate_dentry(struct dentry *); extern int cifs_invalidate_mapping(struct inode *inode); extern int cifs_revalidate_mapping(struct inode *inode); extern int cifs_zap_mapping(struct inode *inode); -extern int cifs_getattr(const struct path *, struct kstat *, u32, unsigned int); -extern int cifs_setattr(struct dentry *, struct iattr *); +extern int cifs_getattr(struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); +extern int cifs_setattr(struct user_namespace *, struct dentry *, + struct iattr *); extern int cifs_fiemap(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); @@ -132,8 +137,8 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path); /* Functions related to symlinks */ extern const char *cifs_get_link(struct dentry *, struct inode *, struct delayed_call *); -extern int cifs_symlink(struct inode *inode, struct dentry *direntry, - const char *symname); +extern int cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, + struct dentry *direntry, const char *symname); #ifdef CONFIG_CIFS_XATTR extern const struct xattr_handler *cifs_xattr_handlers[]; @@ -160,5 +165,5 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.30" +#define CIFS_VERSION "2.31" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 50fcb65920e8..3de3c5908a72 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -21,6 +21,7 @@ #include <linux/in.h> #include <linux/in6.h> +#include <linux/inet.h> #include <linux/slab.h> #include <linux/mempool.h> #include <linux/workqueue.h> @@ -504,6 +505,8 @@ struct smb_version_operations { loff_t (*llseek)(struct file *, struct cifs_tcon *, loff_t, int); /* Check for STATUS_IO_TIMEOUT */ bool (*is_status_io_timeout)(char *buf); + /* Check for STATUS_NETWORK_NAME_DELETED */ + void (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); }; struct smb_version_values { @@ -577,6 +580,7 @@ inc_rfc1001_len(void *buf, int count) struct TCP_Server_Info { struct list_head tcp_ses_list; struct list_head smb_ses_list; + __u64 conn_id; /* connection identifier (useful for debugging) */ int srv_count; /* reference counter */ /* 15 character server name + 0x20 16th byte indicating type = srv */ char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; @@ -901,7 +905,7 @@ struct cifs_ses { kuid_t linux_uid; /* overriding owner of files on the mount */ kuid_t cred_uid; /* owner of credentials */ unsigned int capabilities; - char serverName[SERVER_NAME_LEN_WITH_NULL]; + char ip_addr[INET6_ADDRSTRLEN + 1]; /* Max ipv6 (or v4) addr string len */ char *user_name; /* must not be null except during init of sess and after mount option parsing we fill it */ char *domainName; @@ -1704,7 +1708,9 @@ static inline bool is_retryable_error(int error) #define CIFS_ECHO_OP 0x080 /* echo request */ #define CIFS_OBREAK_OP 0x0100 /* oplock break request */ #define CIFS_NEG_OP 0x0200 /* negotiate request */ -#define CIFS_OP_MASK 0x0380 /* mask request type */ +/* Lower bitmask values are reserved by others below. */ +#define CIFS_SESS_OP 0x2000 /* session setup request */ +#define CIFS_OP_MASK 0x2380 /* mask request type */ #define CIFS_HAS_CREDITS 0x0400 /* already has credits */ #define CIFS_TRANSFORM_REQ 0x0800 /* transform request before sending */ @@ -1844,6 +1850,7 @@ GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ */ GLOBAL_EXTERN atomic_t sesInfoAllocCount; GLOBAL_EXTERN atomic_t tconInfoAllocCount; +GLOBAL_EXTERN atomic_t tcpSesNextId; GLOBAL_EXTERN atomic_t tcpSesAllocCount; GLOBAL_EXTERN atomic_t tcpSesReconnectCount; GLOBAL_EXTERN atomic_t tconInfoReconnectCount; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 32f7a013402e..75ce6f742b8d 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -232,6 +232,8 @@ extern unsigned int setup_special_user_owner_ACE(struct cifs_ace *pace); extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read); +extern ssize_t cifs_discard_from_socket(struct TCP_Server_Info *server, + size_t to_read); extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int page_offset, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 0496934feecb..c279527aae92 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1451,9 +1451,9 @@ cifs_discard_remaining_data(struct TCP_Server_Info *server) while (remaining > 0) { int length; - length = cifs_read_from_socket(server, server->bigbuf, - min_t(unsigned int, remaining, - CIFSMaxBufSize + MAX_HEADER_SIZE(server))); + length = cifs_discard_from_socket(server, + min_t(size_t, remaining, + CIFSMaxBufSize + MAX_HEADER_SIZE(server))); if (length < 0) return length; server->total_read += length; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4bb9decbbf27..112692300fb6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -242,7 +242,7 @@ cifs_reconnect(struct TCP_Server_Info *server) server->max_read = 0; cifs_dbg(FYI, "Mark tcp session as need reconnect\n"); - trace_smb3_reconnect(server->CurrentMid, server->hostname); + trace_smb3_reconnect(server->CurrentMid, server->conn_id, server->hostname); /* before reconnecting the tcp session, mark the smb session (uid) and the tid bad so they are not used until reconnected */ @@ -564,6 +564,23 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, return cifs_readv_from_socket(server, &smb_msg); } +ssize_t +cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read) +{ + struct msghdr smb_msg; + + /* + * iov_iter_discard already sets smb_msg.type and count and iov_offset + * and cifs_readv_from_socket sets msg_control and msg_controllen + * so little to initialize in struct msghdr + */ + smb_msg.msg_name = NULL; + smb_msg.msg_namelen = 0; + iov_iter_discard(&smb_msg.msg_iter, READ, to_read); + + return cifs_readv_from_socket(server, &smb_msg); +} + int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int page_offset, unsigned int to_read) @@ -846,7 +863,7 @@ static void smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) { struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer; - int scredits = server->credits; + int scredits, in_flight; /* * SMB1 does not use credits. @@ -857,12 +874,14 @@ smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) if (shdr->CreditRequest) { spin_lock(&server->req_lock); server->credits += le16_to_cpu(shdr->CreditRequest); + scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); wake_up(&server->request_q); trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, - le16_to_cpu(shdr->CreditRequest)); + server->conn_id, server->hostname, scredits, + le16_to_cpu(shdr->CreditRequest), in_flight); cifs_server_dbg(FYI, "%s: added %u credits total=%d\n", __func__, le16_to_cpu(shdr->CreditRequest), scredits); @@ -993,6 +1012,10 @@ next_pdu: if (mids[i] != NULL) { mids[i]->resp_buf_size = server->pdu_size; + if (bufs[i] && server->ops->is_network_name_deleted) + server->ops->is_network_name_deleted(bufs[i], + server); + if (!mids[i]->multiRsp || mids[i]->multiEnd) mids[i]->callback(mids[i]); @@ -1317,6 +1340,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx) goto out_err_crypto_release; } + tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); tcp_ses->noblockcnt = ctx->rootfs; tcp_ses->noblocksnd = ctx->noblocksnd || ctx->rootfs; tcp_ses->noautotune = ctx->noautotune; @@ -1838,9 +1862,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) /* new SMB session uses our server ref */ ses->server = server; if (server->dstaddr.ss_family == AF_INET6) - sprintf(ses->serverName, "%pI6", &addr6->sin6_addr); + sprintf(ses->ip_addr, "%pI6", &addr6->sin6_addr); else - sprintf(ses->serverName, "%pI4", &addr->sin_addr); + sprintf(ses->ip_addr, "%pI4", &addr->sin_addr); if (ctx->username) { ses->user_name = kstrdup(ctx->username, GFP_KERNEL); @@ -2269,7 +2293,9 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) if (strcmp(old->local_nls->charset, new->local_nls->charset)) return 0; - if (old->ctx->actimeo != new->ctx->actimeo) + if (old->ctx->acregmax != new->ctx->acregmax) + return 0; + if (old->ctx->acdirmax != new->ctx->acdirmax) return 0; return 1; @@ -2911,7 +2937,7 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, #ifdef CONFIG_CIFS_DFS_UPCALL /* * cifs_build_path_to_root returns full path to root when we do not have an - * exiting connection (tcon) + * existing connection (tcon) */ static char * build_unc_path_to_root(const struct smb3_fs_context *ctx, @@ -3038,96 +3064,91 @@ static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it, return 0; } -static int setup_dfs_tgt_conn(const char *path, const char *full_path, - const struct dfs_cache_tgt_iterator *tgt_it, - struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx, - unsigned int *xid, struct TCP_Server_Info **server, - struct cifs_ses **ses, struct cifs_tcon **tcon) -{ - int rc; - struct dfs_info3_param ref = {0}; - char *mdata = NULL; - struct smb3_fs_context fake_ctx = {NULL}; - char *fake_devname = NULL; - - cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path); - - rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref); - if (rc) - return rc; - - mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options, - full_path + 1, &ref, - &fake_devname); - free_dfs_info_param(&ref); - - if (IS_ERR(mdata)) { - rc = PTR_ERR(mdata); - mdata = NULL; - } else - rc = cifs_setup_volume_info(&fake_ctx, mdata, fake_devname); - - kfree(mdata); - kfree(fake_devname); - - if (!rc) { - /* - * We use a 'fake_ctx' here because we need pass it down to the - * mount_{get,put} functions to test connection against new DFS - * targets. - */ - mount_put_conns(cifs_sb, *xid, *server, *ses, *tcon); - rc = mount_get_conns(&fake_ctx, cifs_sb, xid, server, ses, - tcon); - if (!rc || (*server && *ses)) { - /* - * We were able to connect to new target server. - * Update current context with new target server. - */ - rc = update_vol_info(tgt_it, &fake_ctx, ctx); - } - } - smb3_cleanup_fs_context_contents(&fake_ctx); - return rc; -} - static int do_dfs_failover(const char *path, const char *full_path, struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx, struct cifs_ses *root_ses, unsigned int *xid, struct TCP_Server_Info **server, struct cifs_ses **ses, struct cifs_tcon **tcon) { int rc; - struct dfs_cache_tgt_list tgt_list; + struct dfs_cache_tgt_list tgt_list = {0}; struct dfs_cache_tgt_iterator *tgt_it = NULL; + struct smb3_fs_context tmp_ctx = {NULL}; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) return -EOPNOTSUPP; + cifs_dbg(FYI, "%s: path=%s full_path=%s\n", __func__, path, full_path); + rc = dfs_cache_noreq_find(path, NULL, &tgt_list); if (rc) return rc; + /* + * We use a 'tmp_ctx' here because we need pass it down to the mount_{get,put} functions to + * test connection against new DFS targets. + */ + rc = smb3_fs_context_dup(&tmp_ctx, ctx); + if (rc) + goto out; for (;;) { + struct dfs_info3_param ref = {0}; + char *fake_devname = NULL, *mdata = NULL; + /* Get next DFS target server - if any */ rc = get_next_dfs_tgt(path, &tgt_list, &tgt_it); if (rc) break; - /* Connect to next DFS target */ - rc = setup_dfs_tgt_conn(path, full_path, tgt_it, cifs_sb, ctx, xid, server, ses, - tcon); - if (!rc || (*server && *ses)) + + rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref); + if (rc) + break; + + cifs_dbg(FYI, "%s: old ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC, + tmp_ctx.prepath); + + mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options, full_path + 1, &ref, + &fake_devname); + free_dfs_info_param(&ref); + + if (IS_ERR(mdata)) { + rc = PTR_ERR(mdata); + mdata = NULL; + } else + rc = cifs_setup_volume_info(&tmp_ctx, mdata, fake_devname); + + kfree(mdata); + kfree(fake_devname); + + if (rc) break; + + cifs_dbg(FYI, "%s: new ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC, + tmp_ctx.prepath); + + mount_put_conns(cifs_sb, *xid, *server, *ses, *tcon); + rc = mount_get_conns(&tmp_ctx, cifs_sb, xid, server, ses, tcon); + if (!rc || (*server && *ses)) { + /* + * We were able to connect to new target server. Update current context with + * new target server. + */ + rc = update_vol_info(tgt_it, &tmp_ctx, ctx); + break; + } } if (!rc) { + cifs_dbg(FYI, "%s: final ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC, + tmp_ctx.prepath); /* - * Update DFS target hint in DFS referral cache with the target - * server we successfully reconnected to. + * Update DFS target hint in DFS referral cache with the target server we + * successfully reconnected to. */ - rc = dfs_cache_update_tgthint(*xid, root_ses ? root_ses : *ses, - cifs_sb->local_nls, - cifs_remap(cifs_sb), path, - tgt_it); + rc = dfs_cache_update_tgthint(*xid, root_ses ? root_ses : *ses, cifs_sb->local_nls, + cifs_remap(cifs_sb), path, tgt_it); } + +out: + smb3_cleanup_fs_context_contents(&tmp_ctx); dfs_cache_free_tgts(&tgt_list); return rc; } @@ -3285,77 +3306,77 @@ static void put_root_ses(struct cifs_ses *ses) cifs_put_smb_ses(ses); } -/* Check if a path component is remote and then update @dfs_path accordingly */ -static int check_dfs_prepath(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx, - const unsigned int xid, struct TCP_Server_Info *server, - struct cifs_tcon *tcon, char **dfs_path) +/* Set up next dfs prefix path in @dfs_path */ +static int next_dfs_prepath(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx, + const unsigned int xid, struct TCP_Server_Info *server, + struct cifs_tcon *tcon, char **dfs_path) { - char *path, *s; - char sep = CIFS_DIR_SEP(cifs_sb), tmp; - char *npath; - int rc = 0; - int added_treename = tcon->Flags & SMB_SHARE_IS_IN_DFS; - int skip = added_treename; + char *path, *npath; + int added_treename = is_tcon_dfs(tcon); + int rc; path = cifs_build_path_to_root(ctx, cifs_sb, tcon, added_treename); if (!path) return -ENOMEM; - /* - * Walk through the path components in @path and check if they're accessible. In case any of - * the components is -EREMOTE, then update @dfs_path with the next DFS referral request path - * (NOT including the remaining components). - */ - s = path; - do { - /* skip separators */ - while (*s && *s == sep) - s++; - if (!*s) - break; - /* next separator */ - while (*s && *s != sep) - s++; - /* - * if the treename is added, we then have to skip the first - * part within the separators - */ - if (skip) { - skip = 0; - continue; + rc = is_path_remote(cifs_sb, ctx, xid, server, tcon); + if (rc == -EREMOTE) { + struct smb3_fs_context v = {NULL}; + /* if @path contains a tree name, skip it in the prefix path */ + if (added_treename) { + rc = smb3_parse_devname(path, &v); + if (rc) + goto out; + npath = build_unc_path_to_root(&v, cifs_sb, true); + smb3_cleanup_fs_context_contents(&v); + } else { + v.UNC = ctx->UNC; + v.prepath = path + 1; + npath = build_unc_path_to_root(&v, cifs_sb, true); } - tmp = *s; - *s = 0; - rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, path); - if (rc && rc == -EREMOTE) { - struct smb3_fs_context v = {NULL}; - /* if @path contains a tree name, skip it in the prefix path */ - if (added_treename) { - rc = smb3_parse_devname(path, &v); - if (rc) - break; - rc = -EREMOTE; - npath = build_unc_path_to_root(&v, cifs_sb, true); - smb3_cleanup_fs_context_contents(&v); - } else { - v.UNC = ctx->UNC; - v.prepath = path + 1; - npath = build_unc_path_to_root(&v, cifs_sb, true); - } - if (IS_ERR(npath)) { - rc = PTR_ERR(npath); - break; - } - kfree(*dfs_path); - *dfs_path = npath; + + if (IS_ERR(npath)) { + rc = PTR_ERR(npath); + goto out; } - *s = tmp; - } while (rc == 0); + kfree(*dfs_path); + *dfs_path = npath; + rc = -EREMOTE; + } + +out: kfree(path); return rc; } +/* Check if resolved targets can handle any DFS referrals */ +static int is_referral_server(const char *ref_path, struct cifs_tcon *tcon, bool *ref_server) +{ + int rc; + struct dfs_info3_param ref = {0}; + + if (is_tcon_dfs(tcon)) { + *ref_server = true; + } else { + cifs_dbg(FYI, "%s: ref_path=%s\n", __func__, ref_path); + + rc = dfs_cache_noreq_find(ref_path, &ref, NULL); + if (rc) { + cifs_dbg(VFS, "%s: dfs_cache_noreq_find: failed (rc=%d)\n", __func__, rc); + return rc; + } + cifs_dbg(FYI, "%s: ref.flags=0x%x\n", __func__, ref.flags); + /* + * Check if all targets are capable of handling DFS referrals as per + * MS-DFSC 2.2.4 RESP_GET_DFS_REFERRAL. + */ + *ref_server = !!(ref.flags & DFSREF_REFERRAL_SERVER); + free_dfs_info_param(&ref); + } + return 0; +} + int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) { int rc = 0; @@ -3367,18 +3388,19 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) char *ref_path = NULL, *full_path = NULL; char *oldmnt = NULL; char *mntdata = NULL; + bool ref_server = false; rc = mount_get_conns(ctx, cifs_sb, &xid, &server, &ses, &tcon); /* - * Unconditionally try to get an DFS referral (even cached) to determine whether it is an - * DFS mount. + * If called with 'nodfs' mount option, then skip DFS resolving. Otherwise unconditionally + * try to get an DFS referral (even cached) to determine whether it is an DFS mount. * * Skip prefix path to provide support for DFS referrals from w2k8 servers which don't seem * to respond with PATH_NOT_COVERED to requests that include the prefix. */ - if (dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), ctx->UNC + 1, NULL, + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) || + dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), ctx->UNC + 1, NULL, NULL)) { - /* No DFS referral was returned. Looks like a regular share. */ if (rc) goto error; /* Check if it is fully accessible and then mount it */ @@ -3432,13 +3454,18 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) break; if (!tcon) continue; + /* Make sure that requests go through new root servers */ - if (is_tcon_dfs(tcon)) { + rc = is_referral_server(ref_path + 1, tcon, &ref_server); + if (rc) + break; + if (ref_server) { put_root_ses(root_ses); set_root_ses(cifs_sb, ses, &root_ses); } - /* Check for remaining path components and then continue chasing them (-EREMOTE) */ - rc = check_dfs_prepath(cifs_sb, ctx, xid, server, tcon, &ref_path); + + /* Get next dfs path and then continue chasing them if -EREMOTE */ + rc = next_dfs_prepath(cifs_sb, ctx, xid, server, tcon, &ref_path); /* Prevent recursion on broken link referrals */ if (rc == -EREMOTE && ++count > MAX_NESTED_LINKS) rc = -ELOOP; diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 4950ab0486ae..098b4bc8da59 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -37,11 +37,12 @@ struct cache_dfs_tgt { struct cache_entry { struct hlist_node hlist; const char *path; - int ttl; - int srvtype; - int flags; + int hdr_flags; /* RESP_GET_DFS_REFERRAL.ReferralHeaderFlags */ + int ttl; /* DFS_REREFERRAL_V3.TimeToLive */ + int srvtype; /* DFS_REREFERRAL_V3.ServerType */ + int ref_flags; /* DFS_REREFERRAL_V3.ReferralEntryFlags */ struct timespec64 etime; - int path_consumed; + int path_consumed; /* RESP_GET_DFS_REFERRAL.PathConsumed */ int numtgts; struct list_head tlist; struct cache_dfs_tgt *tgthint; @@ -166,14 +167,11 @@ static int dfscache_proc_show(struct seq_file *m, void *v) continue; seq_printf(m, - "cache entry: path=%s,type=%s,ttl=%d,etime=%ld," - "interlink=%s,path_consumed=%d,expired=%s\n", - ce->path, - ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", - ce->ttl, ce->etime.tv_nsec, - IS_INTERLINK_SET(ce->flags) ? "yes" : "no", - ce->path_consumed, - cache_entry_expired(ce) ? "yes" : "no"); + "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n", + ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", + ce->ttl, ce->etime.tv_nsec, ce->ref_flags, ce->hdr_flags, + IS_INTERLINK_SET(ce->hdr_flags) ? "yes" : "no", + ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); list_for_each_entry(t, &ce->tlist, list) { seq_printf(m, " %s%s\n", @@ -236,11 +234,12 @@ static inline void dump_tgts(const struct cache_entry *ce) static inline void dump_ce(const struct cache_entry *ce) { - cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,interlink=%s,path_consumed=%d,expired=%s\n", + cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n", ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, ce->etime.tv_nsec, - IS_INTERLINK_SET(ce->flags) ? "yes" : "no", + ce->hdr_flags, ce->ref_flags, + IS_INTERLINK_SET(ce->hdr_flags) ? "yes" : "no", ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); dump_tgts(ce); @@ -381,7 +380,8 @@ static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs, ce->ttl = refs[0].ttl; ce->etime = get_expire_time(ce->ttl); ce->srvtype = refs[0].server_type; - ce->flags = refs[0].ref_flag; + ce->hdr_flags = refs[0].flags; + ce->ref_flags = refs[0].ref_flag; ce->path_consumed = refs[0].path_consumed; for (i = 0; i < numrefs; i++) { @@ -799,7 +799,8 @@ static int setup_referral(const char *path, struct cache_entry *ce, ref->path_consumed = ce->path_consumed; ref->ttl = ce->ttl; ref->server_type = ce->srvtype; - ref->ref_flag = ce->flags; + ref->ref_flag = ce->ref_flags; + ref->flags = ce->hdr_flags; return 0; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 97ac363b5df1..a3fb81e0ba17 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -567,8 +567,8 @@ out_free_xid: return rc; } -int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, - bool excl) +int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, + struct dentry *direntry, umode_t mode, bool excl) { int rc; unsigned int xid = get_xid(); @@ -611,8 +611,8 @@ out_free_xid: return rc; } -int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, - dev_t device_number) +int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, + struct dentry *direntry, umode_t mode, dev_t device_number) { int rc = -EPERM; unsigned int xid; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6d001905c8e5..26de4329d161 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -580,7 +580,7 @@ int cifs_open(struct inode *inode, struct file *file) } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { if (tcon->ses->serverNOS) cifs_dbg(VFS, "server %s of type %s returned unexpected error on SMB posix open, disabling posix open support. Check if server update available.\n", - tcon->ses->serverName, + tcon->ses->ip_addr, tcon->ses->serverNOS); tcon->broken_posix_open = true; } else if ((rc != -EIO) && (rc != -EREMOTE) && diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 12a5da0230b5..892f51a21278 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -140,6 +140,8 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("rsize", Opt_rsize), fsparam_u32("wsize", Opt_wsize), fsparam_u32("actimeo", Opt_actimeo), + fsparam_u32("acdirmax", Opt_acdirmax), + fsparam_u32("acregmax", Opt_acregmax), fsparam_u32("echo_interval", Opt_echo_interval), fsparam_u32("max_credits", Opt_max_credits), fsparam_u32("handletimeout", Opt_handletimeout), @@ -397,7 +399,7 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) ctx->vals = &smb3any_values; break; case Smb_default: - ctx->ops = &smb30_operations; /* currently identical with 3.0 */ + ctx->ops = &smb30_operations; ctx->vals = &smbdefault_values; break; default: @@ -542,20 +544,37 @@ static int smb3_fs_context_parse_monolithic(struct fs_context *fc, /* BB Need to add support for sep= here TBD */ while ((key = strsep(&options, ",")) != NULL) { - if (*key) { - size_t v_len = 0; - char *value = strchr(key, '='); - - if (value) { - if (value == key) - continue; - *value++ = 0; - v_len = strlen(value); - } - ret = vfs_parse_fs_string(fc, key, value, v_len); - if (ret < 0) - break; + size_t len; + char *value; + + if (*key == 0) + break; + + /* Check if following character is the deliminator If yes, + * we have encountered a double deliminator reset the NULL + * character to the deliminator + */ + while (options && options[0] == ',') { + len = strlen(key); + strcpy(key + len, options); + options = strchr(options, ','); + if (options) + *options++ = 0; + } + + + len = 0; + value = strchr(key, '='); + if (value) { + if (value == key) + continue; + *value++ = 0; + len = strlen(value); } + + ret = vfs_parse_fs_string(fc, key, value, len); + if (ret < 0) + break; } return ret; @@ -929,12 +948,31 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->wsize = result.uint_32; ctx->got_wsize = true; break; + case Opt_acregmax: + ctx->acregmax = HZ * result.uint_32; + if (ctx->acregmax > CIFS_MAX_ACTIMEO) { + cifs_dbg(VFS, "acregmax too large\n"); + goto cifs_parse_mount_err; + } + break; + case Opt_acdirmax: + ctx->acdirmax = HZ * result.uint_32; + if (ctx->acdirmax > CIFS_MAX_ACTIMEO) { + cifs_dbg(VFS, "acdirmax too large\n"); + goto cifs_parse_mount_err; + } + break; case Opt_actimeo: - ctx->actimeo = HZ * result.uint_32; - if (ctx->actimeo > CIFS_MAX_ACTIMEO) { - cifs_dbg(VFS, "attribute cache timeout too large\n"); + if (HZ * result.uint_32 > CIFS_MAX_ACTIMEO) { + cifs_dbg(VFS, "timeout too large\n"); goto cifs_parse_mount_err; } + if ((ctx->acdirmax != CIFS_DEF_ACTIMEO) || + (ctx->acregmax != CIFS_DEF_ACTIMEO)) { + cifs_dbg(VFS, "actimeo ignored since acregmax or acdirmax specified\n"); + break; + } + ctx->acdirmax = ctx->acregmax = HZ * result.uint_32; break; case Opt_echo_interval: ctx->echo_interval = result.uint_32; @@ -1361,7 +1399,8 @@ int smb3_init_fs_context(struct fs_context *fc) /* default is to use strict cifs caching semantics */ ctx->strict_io = true; - ctx->actimeo = CIFS_DEF_ACTIMEO; + ctx->acregmax = CIFS_DEF_ACTIMEO; + ctx->acdirmax = CIFS_DEF_ACTIMEO; /* Most clients set timeout to 0, allows server to use its default */ ctx->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */ diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index 1c44a460e2c0..87dd1f7168f2 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -118,6 +118,8 @@ enum cifs_param { Opt_rsize, Opt_wsize, Opt_actimeo, + Opt_acdirmax, + Opt_acregmax, Opt_echo_interval, Opt_max_credits, Opt_snapshot, @@ -232,7 +234,9 @@ struct smb3_fs_context { unsigned int wsize; unsigned int min_offload; bool sockopt_tcp_nodelay:1; - unsigned long actimeo; /* attribute cache timeout (jiffies) */ + /* attribute cache timemout for files and directories in jiffies */ + unsigned long acregmax; + unsigned long acdirmax; struct smb_version_operations *ops; struct smb_version_values *vals; char *prepath; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a83b3a8ffaac..7c61bc9573c0 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1857,7 +1857,8 @@ posix_mkdir_get_info: goto posix_mkdir_out; } -int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) +int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, + struct dentry *direntry, umode_t mode) { int rc = 0; unsigned int xid; @@ -2067,9 +2068,9 @@ do_rename_exit: } int -cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, - struct inode *target_dir, struct dentry *target_dentry, - unsigned int flags) +cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, + struct dentry *source_dentry, struct inode *target_dir, + struct dentry *target_dentry, unsigned int flags) { char *from_name = NULL; char *to_name = NULL; @@ -2198,12 +2199,23 @@ cifs_inode_needs_reval(struct inode *inode) if (!lookupCacheEnabled) return true; - if (!cifs_sb->ctx->actimeo) - return true; - - if (!time_in_range(jiffies, cifs_i->time, - cifs_i->time + cifs_sb->ctx->actimeo)) - return true; + /* + * depending on inode type, check if attribute caching disabled for + * files or directories + */ + if (S_ISDIR(inode->i_mode)) { + if (!cifs_sb->ctx->acdirmax) + return true; + if (!time_in_range(jiffies, cifs_i->time, + cifs_i->time + cifs_sb->ctx->acdirmax)) + return true; + } else { /* file */ + if (!cifs_sb->ctx->acregmax) + return true; + if (!time_in_range(jiffies, cifs_i->time, + cifs_i->time + cifs_sb->ctx->acregmax)) + return true; + } /* hardlinked files w/ noserverino get "special" treatment */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && @@ -2370,8 +2382,8 @@ int cifs_revalidate_dentry(struct dentry *dentry) return cifs_revalidate_mapping(inode); } -int cifs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); @@ -2408,7 +2420,7 @@ int cifs_getattr(const struct path *path, struct kstat *stat, return rc; } - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->blksize = cifs_sb->ctx->bsize; stat->ino = CIFS_I(inode)->uniqueid; @@ -2610,7 +2622,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; - rc = setattr_prepare(direntry, attrs); + rc = setattr_prepare(&init_user_ns, direntry, attrs); if (rc < 0) goto out; @@ -2715,7 +2727,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) attrs->ia_size != i_size_read(inode)) truncate_setsize(inode, attrs->ia_size); - setattr_copy(inode, attrs); + setattr_copy(&init_user_ns, inode, attrs); mark_inode_dirty(inode); /* force revalidate when any of these times are set since some @@ -2757,7 +2769,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; - rc = setattr_prepare(direntry, attrs); + rc = setattr_prepare(&init_user_ns, direntry, attrs); if (rc < 0) { free_xid(xid); return rc; @@ -2913,7 +2925,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) attrs->ia_size != i_size_read(inode)) truncate_setsize(inode, attrs->ia_size); - setattr_copy(inode, attrs); + setattr_copy(&init_user_ns, inode, attrs); mark_inode_dirty(inode); cifs_setattr_exit: @@ -2923,7 +2935,8 @@ cifs_setattr_exit: } int -cifs_setattr(struct dentry *direntry, struct iattr *attrs) +cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry, + struct iattr *attrs) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 94dab4309fbb..7c5878a645d9 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -661,7 +661,8 @@ cifs_get_link(struct dentry *direntry, struct inode *inode, } int -cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) +cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, + struct dentry *direntry, const char *symname) { int rc = -EOPNOTSUPP; unsigned int xid; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 213465718fa8..183a3a868d7b 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -218,7 +218,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, /* UNC and paths */ /* XXX: Use ses->server->hostname? */ - sprintf(unc, unc_fmt, ses->serverName); + sprintf(unc, unc_fmt, ses->ip_addr); ctx.UNC = unc; ctx.prepath = ""; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f19274857292..f5087295424c 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -63,17 +63,19 @@ smb2_add_credits(struct TCP_Server_Info *server, const struct cifs_credits *credits, const int optype) { int *val, rc = -1; + int scredits, in_flight; unsigned int add = credits->value; unsigned int instance = credits->instance; bool reconnect_detected = false; + bool reconnect_with_invalid_credits = false; spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); /* eg found case where write overlapping reconnect messed up credits */ if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0)) - trace_smb3_reconnect_with_invalid_credits(server->CurrentMid, - server->hostname, *val, add); + reconnect_with_invalid_credits = true; + if ((instance == 0) || (instance == server->reconnect_instance)) *val += add; else @@ -84,7 +86,9 @@ smb2_add_credits(struct TCP_Server_Info *server, pr_warn_once("server overflowed SMB3 credits\n"); } server->in_flight--; - if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP) + if (server->in_flight == 0 && + ((optype & CIFS_OP_MASK) != CIFS_NEG_OP) && + ((optype & CIFS_OP_MASK) != CIFS_SESS_OP)) rc = change_conf(server); /* * Sometimes server returns 0 credits on oplock break ack - we need to @@ -97,14 +101,26 @@ smb2_add_credits(struct TCP_Server_Info *server, server->oplock_credits++; } } + scredits = *val; + in_flight = server->in_flight; spin_unlock(&server->req_lock); wake_up(&server->request_q); if (reconnect_detected) { + trace_smb3_reconnect_detected(server->CurrentMid, + server->conn_id, server->hostname, scredits, add, in_flight); + cifs_dbg(FYI, "trying to put %d credits from the old server instance %d\n", add, instance); } + if (reconnect_with_invalid_credits) { + trace_smb3_reconnect_with_invalid_credits(server->CurrentMid, + server->conn_id, server->hostname, scredits, add, in_flight); + cifs_dbg(FYI, "Negotiate operation when server credits is non-zero. Optype: %d, server credits: %d, credits added: %d\n", + optype, scredits, add); + } + if (server->tcpStatus == CifsNeedReconnect || server->tcpStatus == CifsExiting) return; @@ -123,23 +139,30 @@ smb2_add_credits(struct TCP_Server_Info *server, cifs_dbg(FYI, "disabling oplocks\n"); break; default: - trace_smb3_add_credits(server->CurrentMid, - server->hostname, rc, add); - cifs_dbg(FYI, "%s: added %u credits total=%d\n", __func__, add, rc); + /* change_conf rebalanced credits for different types */ + break; } + + trace_smb3_add_credits(server->CurrentMid, + server->conn_id, server->hostname, scredits, add, in_flight); + cifs_dbg(FYI, "%s: added %u credits total=%d\n", __func__, add, scredits); } static void smb2_set_credits(struct TCP_Server_Info *server, const int val) { + int scredits, in_flight; + spin_lock(&server->req_lock); server->credits = val; if (val == 1) server->reconnect_instance++; + scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); trace_smb3_set_credits(server->CurrentMid, - server->hostname, val, val); + server->conn_id, server->hostname, scredits, val, in_flight); cifs_dbg(FYI, "%s: set %u credits\n", __func__, val); /* don't log while holding the lock */ @@ -171,7 +194,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, unsigned int *num, struct cifs_credits *credits) { int rc = 0; - unsigned int scredits; + unsigned int scredits, in_flight; spin_lock(&server->req_lock); while (1) { @@ -208,17 +231,18 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE); credits->instance = server->reconnect_instance; server->credits -= credits->value; - scredits = server->credits; server->in_flight++; if (server->in_flight > server->max_in_flight) server->max_in_flight = server->in_flight; break; } } + scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, -(credits->value)); + server->conn_id, server->hostname, scredits, -(credits->value), in_flight); cifs_dbg(FYI, "%s: removed %u credits total=%d\n", __func__, credits->value, scredits); @@ -231,14 +255,14 @@ smb2_adjust_credits(struct TCP_Server_Info *server, const unsigned int payload_size) { int new_val = DIV_ROUND_UP(payload_size, SMB2_MAX_BUFFER_SIZE); - int scredits; + int scredits, in_flight; if (!credits->value || credits->value == new_val) return 0; if (credits->value < new_val) { trace_smb3_too_many_credits(server->CurrentMid, - server->hostname, 0, credits->value - new_val); + server->conn_id, server->hostname, 0, credits->value - new_val, 0); cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)", credits->value, new_val); @@ -248,9 +272,13 @@ smb2_adjust_credits(struct TCP_Server_Info *server, spin_lock(&server->req_lock); if (server->reconnect_instance != credits->instance) { + scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); + trace_smb3_reconnect_detected(server->CurrentMid, - server->hostname, 0, 0); + server->conn_id, server->hostname, scredits, + credits->value - new_val, in_flight); cifs_server_dbg(VFS, "trying to return %d credits to old session\n", credits->value - new_val); return -EAGAIN; @@ -258,15 +286,18 @@ smb2_adjust_credits(struct TCP_Server_Info *server, server->credits += credits->value - new_val; scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); wake_up(&server->request_q); - credits->value = new_val; trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, credits->value - new_val); + server->conn_id, server->hostname, scredits, + credits->value - new_val, in_flight); cifs_dbg(FYI, "%s: adjust added %u credits total=%d\n", __func__, credits->value - new_val, scredits); + credits->value = new_val; + return 0; } @@ -2369,7 +2400,7 @@ static bool smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) { struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; - int scredits; + int scredits, in_flight; if (shdr->Status != STATUS_PENDING) return false; @@ -2378,11 +2409,13 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) spin_lock(&server->req_lock); server->credits += le16_to_cpu(shdr->CreditRequest); scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); wake_up(&server->request_q); trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, le16_to_cpu(shdr->CreditRequest)); + server->conn_id, server->hostname, scredits, + le16_to_cpu(shdr->CreditRequest), in_flight); cifs_dbg(FYI, "%s: status pending add %u credits total=%d\n", __func__, le16_to_cpu(shdr->CreditRequest), scredits); } @@ -2418,6 +2451,34 @@ smb2_is_status_io_timeout(char *buf) return false; } +static void +smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) +{ + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct list_head *tmp, *tmp1; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + if (shdr->Status != STATUS_NETWORK_NAME_DELETED) + return; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each(tmp1, &ses->tcon_list) { + tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + if (tcon->tid == shdr->TreeId) { + tcon->need_reconnect = true; + spin_unlock(&cifs_tcp_ses_lock); + pr_warn_once("Server share %s deleted.\n", + tcon->treeName); + return; + } + } + } + spin_unlock(&cifs_tcp_ses_lock); +} + static int smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, struct cifsInodeInfo *cinode) @@ -4605,6 +4666,10 @@ static void smb2_decrypt_offload(struct work_struct *work) #ifdef CONFIG_CIFS_STATS2 mid->when_received = jiffies; #endif + if (dw->server->ops->is_network_name_deleted) + dw->server->ops->is_network_name_deleted(dw->buf, + dw->server); + mid->callback(mid); } else { spin_lock(&GlobalMid_Lock); @@ -4723,6 +4788,12 @@ non_offloaded_decrypt: rc = handle_read_data(server, *mid, buf, server->vals->read_rsp_size, pages, npages, len, false); + if (rc >= 0) { + if (server->ops->is_network_name_deleted) { + server->ops->is_network_name_deleted(buf, + server); + } + } } free_pages: @@ -5072,6 +5143,7 @@ struct smb_version_operations smb20_operations = { .fiemap = smb3_fiemap, .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, + .is_network_name_deleted = smb2_is_network_name_deleted, }; struct smb_version_operations smb21_operations = { @@ -5173,6 +5245,7 @@ struct smb_version_operations smb21_operations = { .fiemap = smb3_fiemap, .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, + .is_network_name_deleted = smb2_is_network_name_deleted, }; struct smb_version_operations smb30_operations = { @@ -5286,6 +5359,7 @@ struct smb_version_operations smb30_operations = { .fiemap = smb3_fiemap, .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, + .is_network_name_deleted = smb2_is_network_name_deleted, }; struct smb_version_operations smb311_operations = { @@ -5399,6 +5473,7 @@ struct smb_version_operations smb311_operations = { .fiemap = smb3_fiemap, .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, + .is_network_name_deleted = smb2_is_network_name_deleted, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 794fc3b68b4f..4bbb6126b14d 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -814,8 +814,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) SMB3ANY_VERSION_STRING) == 0) { req->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); - req->DialectCount = cpu_to_le16(2); - total_len += 4; + req->Dialects[2] = cpu_to_le16(SMB311_PROT_ID); + req->DialectCount = cpu_to_le16(3); + total_len += 6; } else if (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { req->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); @@ -849,6 +850,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) SMB2_CLIENT_GUID_SIZE); if ((server->vals->protocol_id == SMB311_PROT_ID) || (strcmp(server->vals->version_string, + SMB3ANY_VERSION_STRING) == 0) || + (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0)) assemble_neg_contexts(req, server, &total_len); } @@ -883,6 +886,10 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) cifs_server_dbg(VFS, "SMB2.1 dialect returned but not requested\n"); return -EIO; + } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) { + /* ops set to 3.0 by default for default so update */ + server->ops = &smb311_operations; + server->vals = &smb311_values; } } else if (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { @@ -1042,10 +1049,11 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) SMB3ANY_VERSION_STRING) == 0) { pneg_inbuf->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); pneg_inbuf->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); - pneg_inbuf->DialectCount = cpu_to_le16(2); - /* structure is big enough for 3 dialects, sending only 2 */ + pneg_inbuf->Dialects[2] = cpu_to_le16(SMB311_PROT_ID); + pneg_inbuf->DialectCount = cpu_to_le16(3); + /* SMB 2.1 not included so subtract one dialect from len */ inbuflen = sizeof(*pneg_inbuf) - - (2 * sizeof(pneg_inbuf->Dialects[0])); + (sizeof(pneg_inbuf->Dialects[0])); } else if (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { pneg_inbuf->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); @@ -1053,7 +1061,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) pneg_inbuf->Dialects[2] = cpu_to_le16(SMB302_PROT_ID); pneg_inbuf->Dialects[3] = cpu_to_le16(SMB311_PROT_ID); pneg_inbuf->DialectCount = cpu_to_le16(4); - /* structure is big enough for 3 dialects */ + /* structure is big enough for 4 dialects */ inbuflen = sizeof(*pneg_inbuf); } else { /* otherwise specific dialect was requested */ @@ -1253,7 +1261,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) cifs_ses_server(sess_data->ses), &rqst, &sess_data->buf0_type, - CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov); + CIFS_LOG_ERROR | CIFS_SESS_OP, &rsp_iov); cifs_small_buf_release(sess_data->iov[0].iov_base); memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec)); diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index c3d1a584f251..d6df908dccad 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -851,17 +851,21 @@ DEFINE_SMB3_LEASE_ERR_EVENT(lease_err); DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_PROTO(__u64 currmid, + __u64 conn_id, char *hostname), - TP_ARGS(currmid, hostname), + TP_ARGS(currmid, conn_id, hostname), TP_STRUCT__entry( __field(__u64, currmid) + __field(__u64, conn_id) __field(char *, hostname) ), TP_fast_assign( __entry->currmid = currmid; + __entry->conn_id = conn_id; __entry->hostname = hostname; ), - TP_printk("server=%s current_mid=0x%llx", + TP_printk("conn_id=0x%llx server=%s current_mid=%llu", + __entry->conn_id, __entry->hostname, __entry->currmid) ) @@ -869,44 +873,56 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class, #define DEFINE_SMB3_RECONNECT_EVENT(name) \ DEFINE_EVENT(smb3_reconnect_class, smb3_##name, \ TP_PROTO(__u64 currmid, \ - char *hostname), \ - TP_ARGS(currmid, hostname)) + __u64 conn_id, \ + char *hostname), \ + TP_ARGS(currmid, conn_id, hostname)) DEFINE_SMB3_RECONNECT_EVENT(reconnect); DEFINE_SMB3_RECONNECT_EVENT(partial_send_reconnect); DECLARE_EVENT_CLASS(smb3_credit_class, TP_PROTO(__u64 currmid, + __u64 conn_id, char *hostname, int credits, - int credits_to_add), - TP_ARGS(currmid, hostname, credits, credits_to_add), + int credits_to_add, + int in_flight), + TP_ARGS(currmid, conn_id, hostname, credits, credits_to_add, in_flight), TP_STRUCT__entry( __field(__u64, currmid) + __field(__u64, conn_id) __field(char *, hostname) __field(int, credits) __field(int, credits_to_add) + __field(int, in_flight) ), TP_fast_assign( __entry->currmid = currmid; + __entry->conn_id = conn_id; __entry->hostname = hostname; __entry->credits = credits; __entry->credits_to_add = credits_to_add; + __entry->in_flight = in_flight; ), - TP_printk("server=%s current_mid=0x%llx credits=%d credits_to_add=%d", + TP_printk("conn_id=0x%llx server=%s current_mid=%llu " + "credits=%d credit_change=%d in_flight=%d", + __entry->conn_id, __entry->hostname, __entry->currmid, __entry->credits, - __entry->credits_to_add) + __entry->credits_to_add, + __entry->in_flight) ) #define DEFINE_SMB3_CREDIT_EVENT(name) \ DEFINE_EVENT(smb3_credit_class, smb3_##name, \ TP_PROTO(__u64 currmid, \ + __u64 conn_id, \ char *hostname, \ int credits, \ - int credits_to_add), \ - TP_ARGS(currmid, hostname, credits, credits_to_add)) + int credits_to_add, \ + int in_flight), \ + TP_ARGS(currmid, conn_id, hostname, credits, credits_to_add, in_flight)) DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits); DEFINE_SMB3_CREDIT_EVENT(reconnect_detected); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 4a2b836eb017..e90a1d1380b0 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -445,7 +445,7 @@ unmask: */ server->tcpStatus = CifsNeedReconnect; trace_smb3_partial_send_reconnect(server->CurrentMid, - server->hostname); + server->conn_id, server->hostname); } smbd_done: if (rc < 0 && rc != -EINTR) @@ -527,7 +527,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, int *credits; int optype; long int t; - int scredits = server->credits; + int scredits, in_flight; if (timeout < 0) t = MAX_JIFFY_OFFSET; @@ -551,23 +551,39 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, server->max_in_flight = server->in_flight; *credits -= 1; *instance = server->reconnect_instance; + scredits = *credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); + + trace_smb3_add_credits(server->CurrentMid, + server->conn_id, server->hostname, scredits, -1, in_flight); + cifs_dbg(FYI, "%s: remove %u credits total=%d\n", + __func__, 1, scredits); + return 0; } while (1) { if (*credits < num_credits) { + scredits = *credits; spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); rc = wait_event_killable_timeout(server->request_q, has_credits(server, credits, num_credits), t); cifs_num_waiters_dec(server); if (!rc) { + spin_lock(&server->req_lock); + scredits = *credits; + in_flight = server->in_flight; + spin_unlock(&server->req_lock); + trace_smb3_credit_timeout(server->CurrentMid, - server->hostname, num_credits, 0); + server->conn_id, server->hostname, scredits, + num_credits, in_flight); cifs_server_dbg(VFS, "wait timed out after %d ms\n", - timeout); - return -ENOTSUPP; + timeout); + return -EBUSY; } if (rc == -ERESTARTSYS) return -ERESTARTSYS; @@ -595,6 +611,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, server->in_flight > 2 * MAX_COMPOUND && *credits <= MAX_COMPOUND) { spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); rc = wait_event_killable_timeout( server->request_q, @@ -603,13 +620,18 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, t); cifs_num_waiters_dec(server); if (!rc) { + spin_lock(&server->req_lock); + scredits = *credits; + in_flight = server->in_flight; + spin_unlock(&server->req_lock); + trace_smb3_credit_timeout( - server->CurrentMid, - server->hostname, num_credits, - 0); + server->CurrentMid, + server->conn_id, server->hostname, + scredits, num_credits, in_flight); cifs_server_dbg(VFS, "wait timed out after %d ms\n", - timeout); - return -ENOTSUPP; + timeout); + return -EBUSY; } if (rc == -ERESTARTSYS) return -ERESTARTSYS; @@ -625,16 +647,18 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, /* update # of requests on the wire to server */ if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) { *credits -= num_credits; - scredits = *credits; server->in_flight += num_credits; if (server->in_flight > server->max_in_flight) server->max_in_flight = server->in_flight; *instance = server->reconnect_instance; } + scredits = *credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, -(num_credits)); + server->conn_id, server->hostname, scredits, + -(num_credits), in_flight); cifs_dbg(FYI, "%s: remove %u credits total=%d\n", __func__, num_credits, scredits); break; @@ -656,13 +680,13 @@ wait_for_compound_request(struct TCP_Server_Info *server, int num, const int flags, unsigned int *instance) { int *credits; - int scredits, sin_flight; + int scredits, in_flight; credits = server->ops->get_credits_field(server, flags & CIFS_OP_MASK); spin_lock(&server->req_lock); scredits = *credits; - sin_flight = server->in_flight; + in_flight = server->in_flight; if (*credits < num) { /* @@ -684,10 +708,11 @@ wait_for_compound_request(struct TCP_Server_Info *server, int num, if (server->in_flight == 0) { spin_unlock(&server->req_lock); trace_smb3_insufficient_credits(server->CurrentMid, - server->hostname, scredits, sin_flight); + server->conn_id, server->hostname, scredits, + num, in_flight); cifs_dbg(FYI, "%s: %d requests in flight, needed %d total=%d\n", - __func__, sin_flight, num, scredits); - return -ENOTSUPP; + __func__, in_flight, num, scredits); + return -EDEADLK; } } spin_unlock(&server->req_lock); @@ -1171,7 +1196,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, /* * Compounding is never used during session establish. */ - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) smb311_update_preauth_hash(ses, rqst[0].rq_iov, rqst[0].rq_nvec); @@ -1236,7 +1261,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, /* * Compounding is never used during session establish. */ - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) { + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { struct kvec iov = { .iov_base = resp_iov[0].iov_base, .iov_len = resp_iov[0].iov_len diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 6b658a1172ef..41a611e76bb7 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -101,6 +101,7 @@ static int cifs_creation_time_set(unsigned int xid, struct cifs_tcon *pTcon, } static int cifs_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index d5ebd36fb2cc..e7b27754ce78 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -46,10 +46,12 @@ extern const struct file_operations coda_ioctl_operations; /* operations shared over more than one file */ int coda_open(struct inode *i, struct file *f); int coda_release(struct inode *i, struct file *f); -int coda_permission(struct inode *inode, int mask); +int coda_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask); int coda_revalidate_inode(struct inode *); -int coda_getattr(const struct path *, struct kstat *, u32, unsigned int); -int coda_setattr(struct dentry *, struct iattr *); +int coda_getattr(struct user_namespace *, const struct path *, struct kstat *, + u32, unsigned int); +int coda_setattr(struct user_namespace *, struct dentry *, struct iattr *); /* this file: heloers */ char *coda_f2s(struct CodaFid *f); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index ca40c2556ba6..d69989c1bac3 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -73,7 +73,8 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsig } -int coda_permission(struct inode *inode, int mask) +int coda_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { int error; @@ -132,7 +133,8 @@ static inline void coda_dir_drop_nlink(struct inode *dir) } /* creation routines: create, mknod, mkdir, link, symlink */ -static int coda_create(struct inode *dir, struct dentry *de, umode_t mode, bool excl) +static int coda_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *de, umode_t mode, bool excl) { int error; const char *name=de->d_name.name; @@ -164,7 +166,8 @@ err_out: return error; } -static int coda_mkdir(struct inode *dir, struct dentry *de, umode_t mode) +static int coda_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *de, umode_t mode) { struct inode *inode; struct coda_vattr attrs; @@ -225,7 +228,8 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode, } -static int coda_symlink(struct inode *dir_inode, struct dentry *de, +static int coda_symlink(struct user_namespace *mnt_userns, + struct inode *dir_inode, struct dentry *de, const char *symname) { const char *name = de->d_name.name; @@ -291,9 +295,9 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) } /* rename */ -static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int coda_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { const char *old_name = old_dentry->d_name.name; const char *new_name = new_dentry->d_name.name; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index b1c70e2b9b1e..d9f1bd7153df 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -251,16 +251,17 @@ static void coda_evict_inode(struct inode *inode) coda_cache_clear_inode(inode); } -int coda_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int coda_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { int err = coda_revalidate_inode(d_inode(path->dentry)); if (!err) - generic_fillattr(d_inode(path->dentry), stat); + generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); return err; } -int coda_setattr(struct dentry *de, struct iattr *iattr) +int coda_setattr(struct user_namespace *mnt_userns, struct dentry *de, + struct iattr *iattr) { struct inode *inode = d_inode(de); struct coda_vattr vattr; diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 3aec27e5eb82..cb9fd59a688c 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -24,7 +24,8 @@ #include "coda_linux.h" /* pioctl ops */ -static int coda_ioctl_permission(struct inode *inode, int mask); +static int coda_ioctl_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); static long coda_pioctl(struct file *filp, unsigned int cmd, unsigned long user_data); @@ -40,7 +41,8 @@ const struct file_operations coda_ioctl_operations = { }; /* the coda pioctl inode ops */ -static int coda_ioctl_permission(struct inode *inode, int mask) +static int coda_ioctl_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { return (mask & MAY_EXEC) ? -EACCES : 0; } diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 22dce2d35a4b..9a3aed249692 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -79,7 +79,8 @@ extern void configfs_hash_and_remove(struct dentry * dir, const char * name); extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); -extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr); +extern int configfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr); extern struct dentry *configfs_pin_fs(void); extern void configfs_release_fs(void); @@ -92,7 +93,8 @@ extern const struct inode_operations configfs_root_inode_operations; extern const struct inode_operations configfs_symlink_inode_operations; extern const struct dentry_operations configfs_dentry_ops; -extern int configfs_symlink(struct inode *dir, struct dentry *dentry, +extern int configfs_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, const char *symname); extern int configfs_unlink(struct inode *dir, struct dentry *dentry); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index b839dd1b459f..b6098e02e20b 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1268,7 +1268,8 @@ out_root_unlock: } EXPORT_SYMBOL(configfs_depend_item_unlocked); -static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int configfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { int ret = 0; int module_got = 0; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 8bd6a883c94c..42c348bb2903 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -40,7 +40,8 @@ static const struct inode_operations configfs_inode_operations ={ .setattr = configfs_setattr, }; -int configfs_setattr(struct dentry * dentry, struct iattr * iattr) +int configfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct inode * inode = d_inode(dentry); struct configfs_dirent * sd = dentry->d_fsdata; @@ -67,7 +68,7 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) } /* attributes were changed atleast once in past */ - error = simple_setattr(dentry, iattr); + error = simple_setattr(mnt_userns, dentry, iattr); if (error) return error; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index cb61467478ca..77c854364e60 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -139,7 +139,8 @@ static int get_target(const char *symname, struct path *path, } -int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +int configfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { int ret; struct path path; @@ -197,7 +198,8 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna if (dentry->d_inode || d_unhashed(dentry)) ret = -EEXIST; else - ret = inode_permission(dir, MAY_WRITE | MAY_EXEC); + ret = inode_permission(&init_user_ns, dir, + MAY_WRITE | MAY_EXEC); if (!ret) ret = type->ct_item_ops->allow_link(parent_item, target_item); if (!ret) { diff --git a/fs/coredump.c b/fs/coredump.c index a2f6ecc8e345..1c0fdc1aa70b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -703,6 +703,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) goto close_fail; } } else { + struct user_namespace *mnt_userns; struct inode *inode; int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW | O_LARGEFILE | O_EXCL; @@ -780,13 +781,15 @@ void do_coredump(const kernel_siginfo_t *siginfo) * a process dumps core while its cwd is e.g. on a vfat * filesystem. */ - if (!uid_eq(inode->i_uid, current_fsuid())) + mnt_userns = file_mnt_user_ns(cprm.file); + if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), current_fsuid())) goto close_fail; if ((inode->i_mode & 0677) != 0600) goto close_fail; if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) goto close_fail; - if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) + if (do_truncate(mnt_userns, cprm.file->f_path.dentry, + 0, 0, cprm.file)) goto close_fail; } @@ -894,10 +897,10 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, */ page = get_dump_page(addr); if (page) { - void *kaddr = kmap(page); + void *kaddr = kmap_local_page(page); stop = !dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap(page); + kunmap_local(kaddr); put_page(page); } else { stop = !dump_skip(cprm, PAGE_SIZE); @@ -931,7 +934,8 @@ void dump_truncate(struct coredump_params *cprm) if (file->f_op->llseek && file->f_op->llseek != no_llseek) { offset = file->f_op->llseek(file, 0, SEEK_CUR); if (i_size_read(file->f_mapping->host) < offset) - do_truncate(file->f_path.dentry, offset, 0, file); + do_truncate(file_mnt_user_ns(file), file->f_path.dentry, + offset, 0, file); } } EXPORT_SYMBOL(dump_truncate); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index a51cef6bd27f..ed3d623724cd 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -465,7 +465,7 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) return -EFAULT; policy.version = version; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; ret = mnt_want_write_file(filp); diff --git a/fs/dcache.c b/fs/dcache.c index 799d9e4f0bcd..7d24ff7eb206 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2176,8 +2176,8 @@ EXPORT_SYMBOL(d_obtain_root); * same inode, only the actual correct case is stored in the dcache for * case-insensitive filesystems. * - * For a case-insensitive lookup match and if the the case-exact dentry - * already exists in in the dcache, use it and return it. + * For a case-insensitive lookup match and if the case-exact dentry + * already exists in the dcache, use it and return it. * * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 2fcf66473436..22e86ae4dd5a 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -42,13 +42,14 @@ static unsigned int debugfs_allow = DEFAULT_DEBUGFS_ALLOW_BITS; * so that we can use the file mode as part of a heuristic to determine whether * to lock down individual files. */ -static int debugfs_setattr(struct dentry *dentry, struct iattr *ia) +static int debugfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *ia) { int ret = security_locked_down(LOCKDOWN_DEBUGFS); if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) return ret; - return simple_setattr(dentry, ia); + return simple_setattr(&init_user_ns, dentry, ia); } static const struct inode_operations debugfs_file_inode_operations = { @@ -297,7 +298,7 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) { struct dentry *dentry; - if (IS_ERR(parent)) + if (!debugfs_initialized() || IS_ERR_OR_NULL(name) || IS_ERR(parent)) return NULL; if (!parent) @@ -318,6 +319,9 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!(debugfs_allow & DEBUGFS_ALLOW_API)) return ERR_PTR(-EPERM); + if (!debugfs_initialized()) + return ERR_PTR(-ENOENT); + pr_debug("creating file '%s'\n", name); if (IS_ERR(parent)) @@ -775,8 +779,8 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, take_dentry_name_snapshot(&old_name, old_dentry); - error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir), - dentry, 0); + error = simple_rename(&init_user_ns, d_inode(old_dir), old_dentry, + d_inode(new_dir), dentry, 0); if (error) { release_dentry_name_snapshot(&old_name); goto exit; diff --git a/fs/direct-io.c b/fs/direct-io.c index aa1083ecd623..b61491bf3166 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -462,7 +462,7 @@ static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) * Wait for the next BIO to complete. Remove it and return it. NULL is * returned once all BIOs have been completed. This must only be called once * all bios have been issued so that dio->refcount can only decrease. This - * requires that that the caller hold a reference on the dio. + * requires that the caller hold a reference on the dio. */ static struct bio *dio_await_one(struct dio *dio) { @@ -695,7 +695,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, if (ret) goto out; sector = start_sector << (sdio->blkbits - 9); - nr_pages = min(sdio->pages_in_io, BIO_MAX_PAGES); + nr_pages = bio_max_segs(sdio->pages_in_io); BUG_ON(nr_pages <= 0); dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); sdio->boundary = 0; @@ -1279,7 +1279,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, if (retval == -ENOTBLK) { /* * The remaining part of the request will be - * be handled by buffered I/O when we return + * handled by buffered I/O when we return */ retval = 0; } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 0681540c48d9..943e523f4c9d 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1110,8 +1110,8 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, } inode_lock(lower_inode); - rc = __vfs_setxattr(lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME, - page_virt, size, 0); + rc = __vfs_setxattr(&init_user_ns, lower_dentry, lower_inode, + ECRYPTFS_XATTR_NAME, page_virt, size, 0); if (!rc && ecryptfs_inode) fsstack_copy_attr_all(ecryptfs_inode, lower_inode); inode_unlock(lower_inode); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 58d0f7187997..18e9285fbb4c 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -141,7 +141,8 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, else if (d_unhashed(lower_dentry)) rc = -EINVAL; else - rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL); + rc = vfs_unlink(&init_user_ns, lower_dir_inode, lower_dentry, + NULL); if (rc) { printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); goto out_unlock; @@ -180,7 +181,8 @@ ecryptfs_do_create(struct inode *directory_inode, lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_create(d_inode(lower_dir_dentry), lower_dentry, mode, true); + rc = vfs_create(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, + mode, true); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); @@ -190,7 +192,8 @@ ecryptfs_do_create(struct inode *directory_inode, inode = __ecryptfs_get_inode(d_inode(lower_dentry), directory_inode->i_sb); if (IS_ERR(inode)) { - vfs_unlink(d_inode(lower_dir_dentry), lower_dentry, NULL); + vfs_unlink(&init_user_ns, d_inode(lower_dir_dentry), + lower_dentry, NULL); goto out_lock; } fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry)); @@ -254,7 +257,8 @@ out: * Returns zero on success; non-zero on error condition */ static int -ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, +ecryptfs_create(struct user_namespace *mnt_userns, + struct inode *directory_inode, struct dentry *ecryptfs_dentry, umode_t mode, bool excl) { struct inode *ecryptfs_inode; @@ -436,8 +440,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, dget(lower_old_dentry); dget(lower_new_dentry); lower_dir_dentry = lock_parent(lower_new_dentry); - rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry), - lower_new_dentry, NULL); + rc = vfs_link(lower_old_dentry, &init_user_ns, + d_inode(lower_dir_dentry), lower_new_dentry, NULL); if (rc || d_really_is_negative(lower_new_dentry)) goto out_lock; rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); @@ -460,7 +464,8 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) return ecryptfs_do_unlink(dir, dentry, d_inode(dentry)); } -static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, +static int ecryptfs_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, const char *symname) { int rc; @@ -481,7 +486,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, strlen(symname)); if (rc) goto out_lock; - rc = vfs_symlink(d_inode(lower_dir_dentry), lower_dentry, + rc = vfs_symlink(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, encoded_symname); kfree(encoded_symname); if (rc || d_really_is_negative(lower_dentry)) @@ -499,7 +504,8 @@ out_lock: return rc; } -static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { int rc; struct dentry *lower_dentry; @@ -507,7 +513,8 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode lower_dentry = ecryptfs_dentry_to_lower(dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_mkdir(d_inode(lower_dir_dentry), lower_dentry, mode); + rc = vfs_mkdir(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, + mode); if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); @@ -541,7 +548,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) else if (d_unhashed(lower_dentry)) rc = -EINVAL; else - rc = vfs_rmdir(lower_dir_inode, lower_dentry); + rc = vfs_rmdir(&init_user_ns, lower_dir_inode, lower_dentry); if (!rc) { clear_nlink(d_inode(dentry)); fsstack_copy_attr_times(dir, lower_dir_inode); @@ -555,7 +562,8 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) } static int -ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { int rc; struct dentry *lower_dentry; @@ -563,7 +571,8 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev lower_dentry = ecryptfs_dentry_to_lower(dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_mknod(d_inode(lower_dir_dentry), lower_dentry, mode, dev); + rc = vfs_mknod(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, + mode, dev); if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); @@ -579,9 +588,9 @@ out: } static int -ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { int rc; struct dentry *lower_old_dentry; @@ -590,6 +599,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_new_dir_dentry; struct dentry *trap; struct inode *target_inode; + struct renamedata rd = {}; if (flags) return -EINVAL; @@ -619,9 +629,14 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, rc = -ENOTEMPTY; goto out_lock; } - rc = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry, - d_inode(lower_new_dir_dentry), lower_new_dentry, - NULL, 0); + + rd.old_mnt_userns = &init_user_ns; + rd.old_dir = d_inode(lower_old_dir_dentry); + rd.old_dentry = lower_old_dentry; + rd.new_mnt_userns = &init_user_ns; + rd.new_dir = d_inode(lower_new_dir_dentry); + rd.new_dentry = lower_new_dentry; + rc = vfs_rename(&rd); if (rc) goto out_lock; if (target_inode) @@ -855,16 +870,19 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); inode_lock(d_inode(lower_dentry)); - rc = notify_change(lower_dentry, &lower_ia, NULL); + rc = notify_change(&init_user_ns, lower_dentry, + &lower_ia, NULL); inode_unlock(d_inode(lower_dentry)); } return rc; } static int -ecryptfs_permission(struct inode *inode, int mask) +ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { - return inode_permission(ecryptfs_inode_to_lower(inode), mask); + return inode_permission(&init_user_ns, + ecryptfs_inode_to_lower(inode), mask); } /** @@ -879,7 +897,8 @@ ecryptfs_permission(struct inode *inode, int mask) * All other metadata changes will be passed right to the lower filesystem, * and we will just update our inode to look like the lower. */ -static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) +static int ecryptfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *ia) { int rc = 0; struct dentry *lower_dentry; @@ -933,7 +952,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) } mutex_unlock(&crypt_stat->cs_mutex); - rc = setattr_prepare(dentry, ia); + rc = setattr_prepare(&init_user_ns, dentry, ia); if (rc) goto out; if (ia->ia_valid & ATTR_SIZE) { @@ -959,14 +978,15 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) lower_ia.ia_valid &= ~ATTR_MODE; inode_lock(d_inode(lower_dentry)); - rc = notify_change(lower_dentry, &lower_ia, NULL); + rc = notify_change(&init_user_ns, lower_dentry, &lower_ia, NULL); inode_unlock(d_inode(lower_dentry)); out: fsstack_copy_attr_all(inode, lower_inode); return rc; } -static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat, +static int ecryptfs_getattr_link(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; @@ -975,7 +995,7 @@ static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat, mount_crypt_stat = &ecryptfs_superblock_to_private( dentry->d_sb)->mount_crypt_stat; - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(&init_user_ns, d_inode(dentry), stat); if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { char *target; size_t targetsiz; @@ -991,7 +1011,8 @@ static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat, return rc; } -static int ecryptfs_getattr(const struct path *path, struct kstat *stat, +static int ecryptfs_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; @@ -1003,7 +1024,7 @@ static int ecryptfs_getattr(const struct path *path, struct kstat *stat, if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(&init_user_ns, d_inode(dentry), stat); stat->blocks = lower_stat.blocks; } return rc; @@ -1025,7 +1046,7 @@ ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, goto out; } inode_lock(lower_inode); - rc = __vfs_setxattr_locked(lower_dentry, name, value, size, flags, NULL); + rc = __vfs_setxattr_locked(&init_user_ns, lower_dentry, name, value, size, flags, NULL); inode_unlock(lower_inode); if (!rc && inode) fsstack_copy_attr_all(inode, lower_inode); @@ -1091,7 +1112,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, struct inode *inode, goto out; } inode_lock(lower_inode); - rc = __vfs_removexattr(lower_dentry, name); + rc = __vfs_removexattr(&init_user_ns, lower_dentry, name); inode_unlock(lower_inode); out: return rc; @@ -1135,6 +1156,7 @@ static int ecryptfs_xattr_get(const struct xattr_handler *handler, } static int ecryptfs_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index e63259fdef28..cdf40a54a35d 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -531,6 +531,12 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out_free; } + if (mnt_user_ns(path.mnt) != &init_user_ns) { + rc = -EINVAL; + printk(KERN_ERR "Mounting on idmapped mounts currently disallowed\n"); + goto out_free; + } + if (check_ruid && !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) { rc = -EPERM; printk(KERN_ERR "Mount of device (uid: %d) not owned by " diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 019572c6b39a..2f333a40ff4d 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -426,8 +426,8 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) if (size < 0) size = 8; put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); - rc = __vfs_setxattr(lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME, - xattr_virt, size, 0); + rc = __vfs_setxattr(&init_user_ns, lower_dentry, lower_inode, + ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); inode_unlock(lower_inode); if (rc) printk(KERN_ERR "Error whilst attempting to write inode size " diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index feaa5e182b7b..e6bc0302643b 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -137,7 +137,7 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg) unsigned int oldflags = efivarfs_getflags(inode); int error; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (copy_from_user(&flags, arg, sizeof(flags))) diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 0297ad95eb5c..14e2947975fd 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -66,8 +66,8 @@ bool efivarfs_valid_name(const char *str, int len) return uuid_is_valid(s); } -static int efivarfs_create(struct inode *dir, struct dentry *dentry, - umode_t mode, bool excl) +static int efivarfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode = NULL; struct efivar_entry *var; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index ea4f693bee22..f88851c5c250 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -215,10 +215,8 @@ submit_bio_retry: /* max # of continuous pages */ if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE)) nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE); - if (nblocks > BIO_MAX_PAGES) - nblocks = BIO_MAX_PAGES; - bio = bio_alloc(GFP_NOIO, nblocks); + bio = bio_alloc(GFP_NOIO, bio_max_segs(nblocks)); bio->bi_end_io = erofs_readendio; bio_set_dev(bio, sb->s_bdev); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 3e21c0e8adae..119fdce1b520 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -331,8 +331,9 @@ struct inode *erofs_iget(struct super_block *sb, return inode; } -int erofs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) { struct inode *const inode = d_inode(path->dentry); @@ -343,7 +344,7 @@ int erofs_getattr(const struct path *path, struct kstat *stat, stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); return 0; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 67a7ec945686..351dae524a0c 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -373,8 +373,9 @@ extern const struct inode_operations erofs_symlink_iops; extern const struct inode_operations erofs_fast_symlink_iops; struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir); -int erofs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags); +int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags); /* namei.c */ extern const struct inode_operations erofs_dir_iops; diff --git a/fs/exec.c b/fs/exec.c index 5a853f03c233..18594f11c31f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1404,14 +1404,15 @@ EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); - if (inode_permission(inode, MAY_READ) < 0) { + struct user_namespace *mnt_userns = file_mnt_user_ns(file); + if (inode_permission(mnt_userns, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && - !privileged_wrt_inode_uidgid(user_ns, inode)) + !privileged_wrt_inode_uidgid(user_ns, mnt_userns, inode)) user_ns = user_ns->parent; if (old != user_ns) { @@ -1454,7 +1455,7 @@ EXPORT_SYMBOL(finalize_exec); /* * Prepare credentials and lock ->cred_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. - * Or, if exec fails before, free_bprm() should release ->cred and + * Or, if exec fails before, free_bprm() should release ->cred * and unlock. */ static int prepare_bprm_creds(struct linux_binprm *bprm) @@ -1579,6 +1580,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ + struct user_namespace *mnt_userns; struct inode *inode; unsigned int mode; kuid_t uid; @@ -1595,13 +1597,15 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) if (!(mode & (S_ISUID|S_ISGID))) return; + mnt_userns = file_mnt_user_ns(file); + /* Be careful if suid/sgid is set */ inode_lock(inode); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; - uid = inode->i_uid; - gid = inode->i_gid; + uid = i_uid_into_mnt(mnt_userns, inode); + gid = i_gid_into_mnt(mnt_userns, inode); inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ @@ -1837,7 +1841,7 @@ static int bprm_execve(struct linux_binprm *bprm, out: /* - * If past the point of no return ensure the the code never + * If past the point of no return ensure the code never * returns to the userspace process. Use an existing fatal * signal if present otherwise terminate the process with * SIGSEGV. diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 764bc645241e..fa21421a14d9 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -416,9 +416,11 @@ int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count); extern const struct file_operations exfat_file_operations; int __exfat_truncate(struct inode *inode, loff_t new_size); void exfat_truncate(struct inode *inode, loff_t size); -int exfat_setattr(struct dentry *dentry, struct iattr *attr); -int exfat_getattr(const struct path *path, struct kstat *stat, - unsigned int request_mask, unsigned int query_flags); +int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); +int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, unsigned int request_mask, + unsigned int query_flags); int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); /* namei.c */ diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 183ffdf4d43c..f783cf38dd8e 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -267,13 +267,14 @@ write_size: mutex_unlock(&sbi->s_lock); } -int exfat_getattr(const struct path *path, struct kstat *stat, - unsigned int request_mask, unsigned int query_flags) +int exfat_getattr(struct user_namespace *mnt_uerns, const struct path *path, + struct kstat *stat, unsigned int request_mask, + unsigned int query_flags) { struct inode *inode = d_backing_inode(path->dentry); struct exfat_inode_info *ei = EXFAT_I(inode); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); exfat_truncate_atime(&stat->atime); stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; @@ -282,7 +283,8 @@ int exfat_getattr(const struct path *path, struct kstat *stat, return 0; } -int exfat_setattr(struct dentry *dentry, struct iattr *attr) +int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb); struct inode *inode = dentry->d_inode; @@ -305,7 +307,7 @@ int exfat_setattr(struct dentry *dentry, struct iattr *attr) ATTR_TIMES_SET); } - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); attr->ia_valid = ia_valid; if (error) goto out; @@ -340,7 +342,7 @@ int exfat_setattr(struct dentry *dentry, struct iattr *attr) up_write(&EXFAT_I(inode)->truncate_lock); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); exfat_truncate_atime(&inode->i_atime); mark_inode_dirty(inode); diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 2932b23a3b6c..d9e8ec689c55 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -541,8 +541,8 @@ out: return ret; } -static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int exfat_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -827,7 +827,8 @@ unlock: return err; } -static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int exfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -1318,9 +1319,10 @@ out: return ret; } -static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int exfat_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct inode *old_inode, *new_inode; struct super_block *sb = old_dir->i_sb; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index cf4c77f8dd08..b9a9db98e94b 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -216,14 +216,16 @@ __ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) * inode->i_mutex: down */ int -ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int error; int update_mode = 0; umode_t mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = posix_acl_update_mode(&init_user_ns, inode, &mode, + &acl); if (error) return error; update_mode = 1; diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 0f01c759daac..917db5f6630a 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -56,7 +56,8 @@ static inline int ext2_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext2_get_acl(struct inode *inode, int type); -extern int ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); extern int ext2_init_acl (struct inode *, struct inode *); #else diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 2a4175fbaf5e..3309fb2d327a 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -764,8 +764,9 @@ extern struct inode *ext2_iget (struct super_block *, unsigned long); extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_evict_inode(struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern int ext2_setattr (struct dentry *, struct iattr *); -extern int ext2_getattr (const struct path *, struct kstat *, u32, unsigned int); +extern int ext2_setattr (struct user_namespace *, struct dentry *, struct iattr *); +extern int ext2_getattr (struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); extern void ext2_set_inode_flags(struct inode *inode); extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 432c3febea6d..df14e750e9fe 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -551,7 +551,7 @@ got: inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; } else - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 78c417d3c898..68178b2234bd 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1638,8 +1638,8 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } -int ext2_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int ext2_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ext2_inode_info *ei = EXT2_I(inode); @@ -1660,16 +1660,17 @@ int ext2_getattr(const struct path *path, struct kstat *stat, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); return 0; } -int ext2_setattr(struct dentry *dentry, struct iattr *iattr) +int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(dentry, iattr); + error = setattr_prepare(&init_user_ns, dentry, iattr); if (error) return error; @@ -1689,9 +1690,9 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) if (error) return error; } - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); if (iattr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(inode, inode->i_mode); + error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); mark_inode_dirty(inode); return error; diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 32a8d10b579d..b399cbb7022d 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -39,7 +39,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (ret) return ret; - if (!inode_owner_or_capable(inode)) { + if (!inode_owner_or_capable(&init_user_ns, inode)) { ret = -EACCES; goto setflags_out; } @@ -84,7 +84,7 @@ setflags_out: case EXT2_IOC_SETVERSION: { __u32 generation; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; ret = mnt_want_write_file(filp); if (ret) @@ -117,7 +117,7 @@ setversion_out: if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (get_user(rsv_window_size, (int __user *)arg)) diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index ea980f1e2e99..3367384d344d 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -100,7 +100,9 @@ struct dentry *ext2_get_parent(struct dentry *child) * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) +static int ext2_create (struct user_namespace * mnt_userns, + struct inode * dir, struct dentry * dentry, + umode_t mode, bool excl) { struct inode *inode; int err; @@ -118,7 +120,8 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode return ext2_add_nondir(dentry, inode); } -static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode = ext2_new_inode(dir, mode, NULL); if (IS_ERR(inode)) @@ -131,7 +134,8 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; } -static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) +static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; int err; @@ -151,8 +155,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, return err; } -static int ext2_symlink (struct inode * dir, struct dentry * dentry, - const char * symname) +static int ext2_symlink (struct user_namespace * mnt_userns, struct inode * dir, + struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; int err = -ENAMETOOLONG; @@ -225,7 +229,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, return err; } -static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) +static int ext2_mkdir(struct user_namespace * mnt_userns, + struct inode * dir, struct dentry * dentry, umode_t mode) { struct inode * inode; int err; @@ -315,8 +320,9 @@ static int ext2_rmdir (struct inode * dir, struct dentry *dentry) return err; } -static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, - struct inode * new_dir, struct dentry * new_dentry, +static int ext2_rename (struct user_namespace * mnt_userns, + struct inode * old_dir, struct dentry * old_dentry, + struct inode * new_dir, struct dentry * new_dentry, unsigned int flags) { struct inode * old_inode = d_inode(old_dentry); diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 9a682e440acb..ebade1f52451 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -19,6 +19,7 @@ ext2_xattr_security_get(const struct xattr_handler *handler, static int ext2_xattr_security_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 49add1107850..18a87d5dd1ab 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -26,6 +26,7 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler, static int ext2_xattr_trusted_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index c243a3b4d69d..58092449f8ff 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -30,6 +30,7 @@ ext2_xattr_user_get(const struct xattr_handler *handler, static int ext2_xattr_user_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/.kunitconfig b/fs/ext4/.kunitconfig new file mode 100644 index 000000000000..bf51da7cd9fc --- /dev/null +++ b/fs/ext4/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_KUNIT_TESTS=y diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 619dd35ddd48..86699c8cab28 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -103,8 +103,7 @@ config EXT4_DEBUG config EXT4_KUNIT_TESTS tristate "KUnit tests for ext4" if !KUNIT_ALL_TESTS - select EXT4_FS - depends on KUNIT + depends on EXT4_FS && KUNIT default KUNIT_ALL_TESTS help This builds the ext4 KUnit tests. diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 68aaed48315f..c5eaffccecc3 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -222,7 +222,8 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) +ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { handle_t *handle; int error, credits, retries = 0; @@ -245,7 +246,7 @@ retry: ext4_fc_start_update(inode); if ((type == ACL_TYPE_ACCESS) && acl) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); if (error) goto out_stop; if (mode != inode->i_mode) diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 9b63f5416a2f..84b8942a57f2 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -56,7 +56,8 @@ static inline int ext4_acl_count(size_t size) /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type); -int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2866d249f3d2..644fd69185d3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2755,18 +2755,19 @@ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, /* ialloc.c */ extern int ext4_mark_inode_used(struct super_block *sb, int ino); -extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, +extern struct inode *__ext4_new_inode(struct user_namespace *, handle_t *, + struct inode *, umode_t, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, int handle_type, unsigned int line_no, int nblocks); -#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ - __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ - i_flags, 0, 0, 0) -#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ + __ext4_new_inode(&init_user_ns, (handle), (dir), (mode), (qstr), \ + (goal), (owner), i_flags, 0, 0, 0) +#define ext4_new_inode_start_handle(mnt_userns, dir, mode, qstr, goal, owner, \ type, nblocks) \ - __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ + __ext4_new_inode((mnt_userns), NULL, (dir), (mode), (qstr), (goal), (owner), \ 0, (type), __LINE__, (nblocks)) @@ -2877,11 +2878,14 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, __ext4_iget((sb), (ino), (flags), __func__, __LINE__) extern int ext4_write_inode(struct inode *, struct writeback_control *); -extern int ext4_setattr(struct dentry *, struct iattr *); -extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int ext4_setattr(struct user_namespace *, struct dentry *, + struct iattr *); +extern int ext4_getattr(struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); -extern int ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int ext4_file_getattr(struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3960b7ec3ab7..77c7c8a54da7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4382,8 +4382,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, { struct inode *inode = file_inode(file); handle_t *handle; - int ret = 0; - int ret2 = 0, ret3 = 0; + int ret, ret2 = 0, ret3 = 0; int retries = 0; int depth = 0; struct ext4_map_blocks map; @@ -4408,7 +4407,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, depth = ext_depth(inode); retry: - while (ret >= 0 && len) { + while (len) { /* * Recalculate credits when extent tree depth changes. */ @@ -4430,9 +4429,13 @@ retry: inode->i_ino, map.m_lblk, map.m_len, ret); ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); + ext4_journal_stop(handle); break; } + /* + * allow a full retry cycle for any remaining allocations + */ + retries = 0; map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; @@ -4450,11 +4453,8 @@ retry: if (unlikely(ret2)) break; } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) { - ret = 0; + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - } return ret > 0 ? ret2 : ret; } diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 6e8208acfc62..6c4f19b0a556 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -915,13 +915,11 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) struct super_block *sb = (struct super_block *)(journal->j_private); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; - struct list_head *pos; int ret = 0; spin_lock(&sbi->s_fc_lock); ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); - list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { - ei = list_entry(pos, struct ext4_inode_info, i_fc_list); + list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); while (atomic_read(&ei->i_fc_updates)) { DEFINE_WAIT(wait); @@ -978,17 +976,15 @@ __releases(&sbi->s_fc_lock) { struct super_block *sb = (struct super_block *)(journal->j_private); struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_fc_dentry_update *fc_dentry; + struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; struct inode *inode; - struct list_head *pos, *n, *fcd_pos, *fcd_n; - struct ext4_inode_info *ei; + struct ext4_inode_info *ei, *ei_n; int ret; if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) return 0; - list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) { - fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update, - fcd_list); + list_for_each_entry_safe(fc_dentry, fc_dentry_n, + &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { spin_unlock(&sbi->s_fc_lock); if (!ext4_fc_add_dentry_tlv( @@ -1004,8 +1000,8 @@ __releases(&sbi->s_fc_lock) } inode = NULL; - list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { - ei = list_entry(pos, struct ext4_inode_info, i_fc_list); + list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN], + i_fc_list) { if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) { inode = &ei->vfs_inode; break; @@ -1057,7 +1053,6 @@ static int ext4_fc_perform_commit(journal_t *journal) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter; struct ext4_fc_head head; - struct list_head *pos; struct inode *inode; struct blk_plug plug; int ret = 0; @@ -1099,8 +1094,7 @@ static int ext4_fc_perform_commit(journal_t *journal) goto out; } - list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { - iter = list_entry(pos, struct ext4_inode_info, i_fc_list); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; @@ -1226,9 +1220,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *iter; + struct ext4_inode_info *iter, *iter_n; struct ext4_fc_dentry_update *fc_dentry; - struct list_head *pos, *n; if (full && sbi->s_fc_bh) sbi->s_fc_bh = NULL; @@ -1236,8 +1229,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full) jbd2_fc_release_bufs(journal); spin_lock(&sbi->s_fc_lock); - list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { - iter = list_entry(pos, struct ext4_inode_info, i_fc_list); + list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], + i_fc_list) { list_del_init(&iter->i_fc_list); ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 20f2fcb799f5..633ae7becd61 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -919,7 +919,8 @@ static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, +struct inode *__ext4_new_inode(struct user_namespace *mnt_userns, + handle_t *handle, struct inode *dir, umode_t mode, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, int handle_type, unsigned int line_no, @@ -969,10 +970,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, i_gid_write(inode, owner[1]); } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; - inode->i_uid = current_fsuid(); + inode->i_uid = fsuid_into_mnt(mnt_userns); inode->i_gid = dir->i_gid; } else - inode_init_owner(inode, dir, mode); + inode_init_owner(mnt_userns, inode, dir, mode); if (ext4_has_feature_project(sb) && ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index de7905284e28..650c5acd2f2d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -20,6 +20,7 @@ */ #include <linux/fs.h> +#include <linux/mount.h> #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> @@ -5315,7 +5316,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * * Called with inode->i_mutex down. */ -int ext4_setattr(struct dentry *dentry, struct iattr *attr) +int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); int error, rc = 0; @@ -5333,7 +5335,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ATTR_GID | ATTR_TIMES_SET)))) return -EPERM; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(mnt_userns, dentry, attr); if (error) return error; @@ -5508,7 +5510,7 @@ out_mmap_sem: } if (!error) { - setattr_copy(inode, attr); + setattr_copy(mnt_userns, inode, attr); mark_inode_dirty(inode); } @@ -5520,7 +5522,7 @@ out_mmap_sem: ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(inode, inode->i_mode); + rc = posix_acl_chmod(mnt_userns, inode, inode->i_mode); err_out: if (error) @@ -5531,8 +5533,8 @@ err_out: return error; } -int ext4_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ext4_inode *raw_inode; @@ -5567,17 +5569,18 @@ int ext4_getattr(const struct path *path, struct kstat *stat, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(inode, stat); + generic_fillattr(mnt_userns, inode, stat); return 0; } -int ext4_file_getattr(const struct path *path, struct kstat *stat, +int ext4_file_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); u64 delalloc_blocks; - ext4_getattr(path, stat, request_mask, query_flags); + ext4_getattr(mnt_userns, path, stat, request_mask, query_flags); /* * If there is inline data in the inode, the inode will normally not diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 713b1ae44c1a..a2cf35066f46 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -107,10 +107,12 @@ void ext4_reset_inode_seed(struct inode *inode) * important fields of the inodes. * * @sb: the super block of the filesystem + * @mnt_userns: user namespace of the mount the inode was found from * @inode: the inode to swap with EXT4_BOOT_LOADER_INO * */ static long swap_inode_boot_loader(struct super_block *sb, + struct user_namespace *mnt_userns, struct inode *inode) { handle_t *handle; @@ -139,7 +141,8 @@ static long swap_inode_boot_loader(struct super_block *sb, } if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || - !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { + !inode_owner_or_capable(mnt_userns, inode) || + !capable(CAP_SYS_ADMIN)) { err = -EPERM; goto journal_err_out; } @@ -814,6 +817,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); unsigned int flags; ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); @@ -829,7 +833,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC_SETFLAGS: { int err; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) @@ -871,7 +875,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) __u32 generation; int err; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; if (ext4_has_metadata_csum(inode->i_sb)) { @@ -1010,7 +1014,7 @@ mext_out: case EXT4_IOC_MIGRATE: { int err; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; err = mnt_want_write_file(filp); @@ -1032,7 +1036,7 @@ mext_out: case EXT4_IOC_ALLOC_DA_BLKS: { int err; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; err = mnt_want_write_file(filp); @@ -1051,7 +1055,7 @@ mext_out: err = mnt_want_write_file(filp); if (err) return err; - err = swap_inode_boot_loader(sb, inode); + err = swap_inode_boot_loader(sb, mnt_userns, inode); mnt_drop_write_file(filp); return err; } @@ -1217,7 +1221,7 @@ resizefs_out: case EXT4_IOC_CLEAR_ES_CACHE: { - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; ext4_clear_inode_es(inode); return 0; @@ -1263,7 +1267,7 @@ resizefs_out: return -EFAULT; /* Make sure caller has proper permission */ - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index cf652ba3e74d..686bf982c84e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -731,6 +731,29 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, (space/bcount)*100/blocksize); return (struct stats) { names, space, bcount}; } + +/* + * Linear search cross check + */ +static inline void htree_rep_invariant_check(struct dx_entry *at, + struct dx_entry *target, + u32 hash, unsigned int n) +{ + while (n--) { + dxtrace(printk(KERN_CONT ",")); + if (dx_get_hash(++at) > hash) { + at--; + break; + } + } + ASSERT(at == target - 1); +} +#else /* DX_DEBUG */ +static inline void htree_rep_invariant_check(struct dx_entry *at, + struct dx_entry *target, + u32 hash, unsigned int n) +{ +} #endif /* DX_DEBUG */ /* @@ -827,20 +850,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, p = m + 1; } - if (0) { // linear search cross check - unsigned n = count - 1; - at = entries; - while (n--) - { - dxtrace(printk(KERN_CONT ",")); - if (dx_get_hash(++at) > hash) - { - at--; - break; - } - } - ASSERT(at == p - 1); - } + htree_rep_invariant_check(entries, p, hash, count - 1); at = p - 1; dxtrace(printk(KERN_CONT " %x->%u\n", @@ -2401,11 +2411,10 @@ again: (frame - 1)->bh); if (err) goto journal_error; - if (restart) { - err = ext4_handle_dirty_dx_node(handle, dir, - frame->bh); + err = ext4_handle_dirty_dx_node(handle, dir, + frame->bh); + if (err) goto journal_error; - } } else { struct dx_root *dxroot; memcpy((char *) entries2, (char *) entries, @@ -2596,8 +2605,8 @@ static int ext4_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int ext4_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { handle_t *handle; struct inode *inode; @@ -2610,8 +2619,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, - NULL, EXT4_HT_DIR, credits); + inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, &dentry->d_name, + 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -2631,8 +2640,8 @@ retry: return err; } -static int ext4_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int ext4_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { handle_t *handle; struct inode *inode; @@ -2645,8 +2654,8 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, - NULL, EXT4_HT_DIR, credits); + inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, &dentry->d_name, + 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -2665,7 +2674,8 @@ retry: return err; } -static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; @@ -2676,7 +2686,7 @@ static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return err; retry: - inode = ext4_new_inode_start_handle(dir, mode, + inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, NULL, 0, NULL, EXT4_HT_DIR, EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + @@ -2774,7 +2784,8 @@ out: return err; } -static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ext4_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; @@ -2790,7 +2801,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, + inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFDIR | mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); @@ -3292,7 +3303,7 @@ out_trace: return retval; } -static int ext4_symlink(struct inode *dir, +static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, const char *symname) { handle_t *handle; @@ -3333,7 +3344,7 @@ static int ext4_symlink(struct inode *dir, EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; } - inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, + inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); @@ -3662,7 +3673,8 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) } } -static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent, +static struct inode *ext4_whiteout_for_rename(struct user_namespace *mnt_userns, + struct ext4_renament *ent, int credits, handle_t **h) { struct inode *wh; @@ -3676,7 +3688,8 @@ static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent, credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + EXT4_XATTR_TRANS_BLOCKS + 4); retry: - wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE, + wh = ext4_new_inode_start_handle(mnt_userns, ent->dir, + S_IFCHR | WHITEOUT_MODE, &ent->dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -3703,9 +3716,9 @@ retry: * while new_{dentry,inode) refers to the destination dentry/inode * This comes from rename(const char *oldpath, const char *newpath) */ -static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { handle_t *handle = NULL; struct ext4_renament old = { @@ -3789,7 +3802,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; } } else { - whiteout = ext4_whiteout_for_rename(&old, credits, &handle); + whiteout = ext4_whiteout_for_rename(mnt_userns, &old, credits, &handle); if (IS_ERR(whiteout)) { retval = PTR_ERR(whiteout); whiteout = NULL; @@ -4085,7 +4098,8 @@ end_rename: return retval; } -static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, +static int ext4_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -4107,7 +4121,7 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, new_dir, new_dentry); } - return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags); + return ext4_rename(mnt_userns, old_dir, old_dentry, new_dir, new_dentry, flags); } /* diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index f014c5e473a9..3db923403505 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -371,8 +371,7 @@ int ext4_mpage_readpages(struct inode *inode, * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ - bio = bio_alloc(GFP_KERNEL, - min_t(int, nr_pages, BIO_MAX_PAGES)); + bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages)); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, page->index); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fb5985102c1d..ad34a37278cd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -59,7 +59,7 @@ #include <trace/events/ext4.h> static struct ext4_lazy_init *ext4_li_info; -static struct mutex ext4_li_mtx; +static DEFINE_MUTEX(ext4_li_mtx); static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, @@ -4875,7 +4875,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); - sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; sbi->s_journal->j_finish_inode_data_buffers = @@ -4987,6 +4986,14 @@ no_journal: goto failed_mount5; } + /* + * We can only set up the journal commit callback once + * mballoc is initialized + */ + if (sbi->s_journal) + sbi->s_journal->j_commit_callback = + ext4_journal_commit_callback; + block = ext4_count_free_clusters(sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); @@ -6654,7 +6661,7 @@ static struct file_system_type ext4_fs_type = { .name = "ext4", .mount = ext4_mount, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("ext4"); @@ -6667,7 +6674,6 @@ static int __init ext4_init_fs(void) ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; - mutex_init(&ext4_li_mtx); /* Build-time check for flags consistency */ ext4_check_flag_values(); diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c index 8cfa74a56361..c78df5790377 100644 --- a/fs/ext4/xattr_hurd.c +++ b/fs/ext4/xattr_hurd.c @@ -32,6 +32,7 @@ ext4_xattr_hurd_get(const struct xattr_handler *handler, static int ext4_xattr_hurd_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 197a9d8a15ef..8213f66f7b2d 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -23,6 +23,7 @@ ext4_xattr_security_get(const struct xattr_handler *handler, static int ext4_xattr_security_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index e9389e5d75c3..7c21ffb26d25 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -30,6 +30,7 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler, static int ext4_xattr_trusted_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index d4546184b34b..2fe7ff0a479c 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -31,6 +31,7 @@ ext4_xattr_user_get(const struct xattr_handler *handler, static int ext4_xattr_user_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 732ec10e7890..965037a9c205 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -214,8 +214,8 @@ static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p, return error; if (error == 0) *acl = NULL; - if (!in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; return 0; @@ -269,7 +269,8 @@ static int __f2fs_set_acl(struct inode *inode, int type, return error; } -int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 124868c13f80..986fd1bc780b 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -34,7 +34,8 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int); -extern int f2fs_set_acl(struct inode *, struct posix_acl *, int); +extern int f2fs_set_acl(struct user_namespace *, struct inode *, + struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); #else diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b9721c8f116c..7c95818639a6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -969,8 +969,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned int post_read_steps = 0; bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, - min_t(int, nr_pages, BIO_MAX_PAGES), - &f2fs_bioset); + bio_max_segs(nr_pages), &f2fs_bioset); if (!bio) return ERR_PTR(-ENOMEM); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 506c801880f3..e2d302ae3a46 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3187,9 +3187,10 @@ void f2fs_truncate_data_blocks(struct dnode_of_data *dn); int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); -int f2fs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); -int f2fs_setattr(struct dentry *dentry, struct iattr *attr); +int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_precache_extents(struct inode *inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 471a6ff0c937..d26ff2ae3f5e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -789,8 +789,8 @@ int f2fs_truncate(struct inode *inode) return 0; } -int f2fs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -826,7 +826,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); /* we need to show initial sectors used for inline_data/dentries */ if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || @@ -837,7 +837,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, } #ifdef CONFIG_F2FS_FS_POSIX_ACL -static void __setattr_copy(struct inode *inode, const struct iattr *attr) +static void __setattr_copy(struct user_namespace *mnt_userns, + struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; @@ -853,9 +854,9 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; + kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (!in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + if (!in_group_p(kgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } @@ -864,7 +865,8 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) #define __setattr_copy setattr_copy #endif -int f2fs_setattr(struct dentry *dentry, struct iattr *attr) +int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; @@ -884,7 +886,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) !f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) return err; @@ -960,10 +962,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) spin_unlock(&F2FS_I(inode)->i_size_lock); } - __setattr_copy(inode, attr); + __setattr_copy(&init_user_ns, inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode)); + err = posix_acl_chmod(&init_user_ns, inode, f2fs_get_inode_mode(inode)); if (is_inode_flag_set(inode, FI_ACL_MODE)) { if (!err) @@ -1978,7 +1980,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) u32 iflags; int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (get_user(fsflags, (int __user *)arg)) @@ -2025,7 +2027,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (!S_ISREG(inode->i_mode)) @@ -2092,7 +2094,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) struct inode *inode = file_inode(filp); int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -2134,7 +2136,7 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) struct inode *inode = file_inode(filp); int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (!S_ISREG(inode->i_mode)) @@ -2169,7 +2171,7 @@ static int f2fs_ioc_release_volatile_write(struct file *filp) struct inode *inode = file_inode(filp); int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -2198,7 +2200,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) struct inode *inode = file_inode(filp); int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -3175,7 +3177,7 @@ static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) return -EFAULT; /* Make sure caller has proper permission */ - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (fa.fsx_xflags & ~F2FS_SUPPORTED_XFLAGS) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 887804968576..17bd072a5d39 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -46,7 +46,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_free = true; - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; @@ -314,8 +314,8 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, } } -static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -637,8 +637,8 @@ static const char *f2fs_get_link(struct dentry *dentry, return link; } -static int f2fs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -717,7 +717,8 @@ out_free_encrypted_link: return err; } -static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -770,8 +771,8 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) return -ENOTEMPTY; } -static int f2fs_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -878,7 +879,8 @@ out: return err; } -static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -1255,7 +1257,8 @@ out: return err; } -static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, +static int f2fs_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a8a0fb890e8d..4b0e2e3c2c88 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2747,7 +2747,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { - nrpages = min(last_offset - i, BIO_MAX_PAGES); + nrpages = bio_max_segs(last_offset - i); /* readahead node pages */ f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8159fae74b9a..490f843ec3bf 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -64,6 +64,7 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, } static int f2fs_xattr_generic_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -107,6 +108,7 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler, } static int f2fs_xattr_advise_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -114,7 +116,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, unsigned char old_advise = F2FS_I(inode)->i_advise; unsigned char new_advise; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; if (value == NULL) return -EINVAL; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 922a0c6ba46c..02d4d4234956 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -397,9 +397,11 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); extern const struct file_operations fat_file_operations; extern const struct inode_operations fat_file_inode_operations; -extern int fat_setattr(struct dentry *dentry, struct iattr *attr); +extern int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); extern void fat_truncate_blocks(struct inode *inode, loff_t offset); -extern int fat_getattr(const struct path *path, struct kstat *stat, +extern int fat_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/fat/file.c b/fs/fat/file.c index 5fee74f1ad61..13855ba49cd9 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -95,7 +95,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) goto out_unlock_inode; /* This MUST be done before doing anything irreversible... */ - err = fat_setattr(file->f_path.dentry, &ia); + err = fat_setattr(file_mnt_user_ns(file), file->f_path.dentry, &ia); if (err) goto out_unlock_inode; @@ -394,11 +394,11 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset) fat_flush_inodes(inode->i_sb, inode, NULL); } -int fat_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int fat_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(inode, stat); + generic_fillattr(mnt_userns, inode, stat); stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) { @@ -447,12 +447,13 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi, return 0; } -static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) +static int fat_allow_set_time(struct user_namespace *mnt_userns, + struct msdos_sb_info *sbi, struct inode *inode) { umode_t allow_utime = sbi->options.allow_utime; - if (!uid_eq(current_fsuid(), inode->i_uid)) { - if (in_group_p(inode->i_gid)) + if (!uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) { + if (in_group_p(i_gid_into_mnt(mnt_userns, inode))) allow_utime >>= 3; if (allow_utime & MAY_WRITE) return 1; @@ -466,7 +467,8 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) /* valid file mode bits */ #define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) -int fat_setattr(struct dentry *dentry, struct iattr *attr) +int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); struct inode *inode = d_inode(dentry); @@ -476,11 +478,11 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) /* Check for setting the inode time. */ ia_valid = attr->ia_valid; if (ia_valid & TIMES_SET_FLAGS) { - if (fat_allow_set_time(sbi, inode)) + if (fat_allow_set_time(mnt_userns, sbi, inode)) attr->ia_valid &= ~TIMES_SET_FLAGS; } - error = setattr_prepare(dentry, attr); + error = setattr_prepare(mnt_userns, dentry, attr); attr->ia_valid = ia_valid; if (error) { if (sbi->options.quiet) @@ -550,7 +552,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) fat_truncate_time(inode, &attr->ia_mtime, S_MTIME); attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME); - setattr_copy(inode, attr); + setattr_copy(mnt_userns, inode, attr); mark_inode_dirty(inode); out: return error; diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 9d062886fbc1..efba301d68ae 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -261,8 +261,8 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, } /***** Create a file */ -static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int msdos_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; struct inode *inode = NULL; @@ -339,7 +339,8 @@ out: } /***** Make a directory */ -static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int msdos_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; @@ -593,7 +594,8 @@ error_inode: } /***** Rename, a wrapper for rename_same_dir & rename_diff_dir */ -static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, +static int msdos_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -665,7 +667,7 @@ static struct file_system_type msdos_fs_type = { .name = "msdos", .mount = msdos_mount, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("msdos"); diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 0cdd0fb9f742..5369d82e0bfb 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -756,8 +756,8 @@ error: return ERR_PTR(err); } -static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int vfat_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -846,7 +846,8 @@ out: return err; } -static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -892,9 +893,9 @@ out: return err; } -static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct buffer_head *dotdot_bh; struct msdos_dir_entry *dotdot_de; @@ -1062,7 +1063,7 @@ static struct file_system_type vfat_fs_type = { .name = "vfat", .mount = vfat_mount, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("vfat"); diff --git a/fs/fcntl.c b/fs/fcntl.c index 483ef8861376..dfc72f15be7f 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -25,6 +25,7 @@ #include <linux/user_namespace.h> #include <linux/memfd.h> #include <linux/compat.h> +#include <linux/mount.h> #include <linux/poll.h> #include <asm/siginfo.h> @@ -46,7 +47,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) /* O_NOATIME can only be set by the owner or superuser */ if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(file_mnt_user_ns(filp), inode)) return -EPERM; /* required for strict SunOS emulation */ diff --git a/fs/fhandle.c b/fs/fhandle.c index 01263ffbc4c0..ec6feeccc276 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -173,7 +173,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, /* * With handle we don't look at the execute bit on the - * the directory. Ideally we would like CAP_DAC_SEARCH. + * directory. Ideally we would like CAP_DAC_SEARCH. * But we don't have that */ if (!capable(CAP_DAC_READ_SEARCH)) { diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index f529075a2ce8..e9c0f916349d 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -50,7 +50,8 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type) return acl; } -int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { struct fuse_conn *fc = get_fuse_conn(inode); const char *name; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 588f8d1240aa..c6636b4c4ccf 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -844,11 +844,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (WARN_ON(PageMlocked(oldpage))) goto out_fallback_unlock; - err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL); - if (err) { - unlock_page(newpage); - goto out_put_old; - } + replace_page_cache_page(oldpage, newpage); get_page(newpage); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 78f9f209078c..06a18700a845 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -605,7 +605,8 @@ out_err: return err; } -static int fuse_mknod(struct inode *, struct dentry *, umode_t, dev_t); +static int fuse_mknod(struct user_namespace *, struct inode *, struct dentry *, + umode_t, dev_t); static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned flags, umode_t mode) @@ -645,7 +646,7 @@ out_dput: return err; mknod: - err = fuse_mknod(dir, entry, mode, 0); + err = fuse_mknod(&init_user_ns, dir, entry, mode, 0); if (err) goto out_dput; no_open: @@ -715,8 +716,8 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, return err; } -static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, - dev_t rdev) +static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, umode_t mode, dev_t rdev) { struct fuse_mknod_in inarg; struct fuse_mount *fm = get_fuse_mount(dir); @@ -738,13 +739,14 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, return create_new_entry(fm, &args, dir, entry, mode); } -static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, - bool excl) +static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, umode_t mode, bool excl) { - return fuse_mknod(dir, entry, mode, 0); + return fuse_mknod(&init_user_ns, dir, entry, mode, 0); } -static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) +static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_mount *fm = get_fuse_mount(dir); @@ -765,8 +767,8 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) return create_new_entry(fm, &args, dir, entry, S_IFDIR); } -static int fuse_symlink(struct inode *dir, struct dentry *entry, - const char *link) +static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, const char *link) { struct fuse_mount *fm = get_fuse_mount(dir); unsigned len = strlen(link) + 1; @@ -908,9 +910,9 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, return err; } -static int fuse_rename2(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent, - unsigned int flags) +static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir, + struct dentry *oldent, struct inode *newdir, + struct dentry *newent, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(olddir); int err; @@ -1087,7 +1089,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else if (stat) { - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; } @@ -1249,7 +1251,8 @@ static int fuse_perm_getattr(struct inode *inode, int mask) * access request is sent. Execute permission is still checked * locally based on file mode. */ -static int fuse_permission(struct inode *inode, int mask) +static int fuse_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); bool refreshed = false; @@ -1280,7 +1283,7 @@ static int fuse_permission(struct inode *inode, int mask) } if (fc->default_permissions) { - err = generic_permission(inode, mask); + err = generic_permission(&init_user_ns, inode, mask); /* If permission is denied, try to refresh file attributes. This is also needed, because the root @@ -1288,7 +1291,8 @@ static int fuse_permission(struct inode *inode, int mask) if (err == -EACCES && !refreshed) { err = fuse_perm_getattr(inode, mask); if (!err) - err = generic_permission(inode, mask); + err = generic_permission(&init_user_ns, + inode, mask); } /* Note: the opposite of the above test does not @@ -1610,7 +1614,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) return err; @@ -1756,7 +1760,8 @@ error: return err; } -static int fuse_setattr(struct dentry *entry, struct iattr *attr) +static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry, + struct iattr *attr) { struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); @@ -1818,7 +1823,8 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) return ret; } -static int fuse_getattr(const struct path *path, struct kstat *stat, +static int fuse_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7c4b8cb93f9f..68cca8d4db6e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1180,8 +1180,8 @@ extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; struct posix_acl *fuse_get_acl(struct inode *inode, int type); -int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); - +int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); /* readdir.c */ int fuse_readdir(struct file *file, struct dir_context *ctx); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index cdea18de94f7..1a7d7ace54e1 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -188,6 +188,7 @@ static int fuse_xattr_get(const struct xattr_handler *handler, } static int fuse_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -214,6 +215,7 @@ static int no_xattr_get(const struct xattr_handler *handler, } static int no_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *nodee, const char *name, const void *value, size_t size, int flags) diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 2e939f5fe751..9165d70ead07 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -106,7 +106,8 @@ out: return error; } -int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; @@ -130,7 +131,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - ret = posix_acl_update_mode(inode, &mode, &acl); + ret = posix_acl_update_mode(&init_user_ns, inode, &mode, &acl); if (ret) goto unlock; } diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 61353a1501c5..eccc6a43326c 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -13,6 +13,7 @@ extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 62d9081d1e26..7a358ae05185 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1230,6 +1230,9 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, gfs2_inplace_release(ip); + if (ip->i_qadata && ip->i_qadata->qa_qd_num) + gfs2_quota_unlock(ip); + if (length != written && (iomap->flags & IOMAP_F_NEW)) { /* Deallocate blocks that were just allocated. */ loff_t blockmask = i_blocksize(inode) - 1; @@ -1242,9 +1245,6 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, } } - if (ip->i_qadata && ip->i_qadata->qa_qd_num) - gfs2_quota_unlock(ip); - if (unlikely(!written)) goto out_unlock; @@ -1538,13 +1538,13 @@ more_rgrps: goto out; } ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, - 0, rd_gh); + LM_FLAG_NODE_SCOPE, rd_gh); if (ret) goto out; /* Must be done with the rgrp glock held: */ if (gfs2_rs_active(&ip->i_res) && - rgd == ip->i_res.rs_rbm.rgd) + rgd == ip->i_res.rs_rgd) gfs2_rs_deltree(&ip->i_res); } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 07f49e5e6304..2d500f90cdac 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -238,7 +238,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask, goto out; error = -EACCES; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) goto out; error = 0; @@ -256,7 +256,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask, !capable(CAP_LINUX_IMMUTABLE)) goto out; if (!IS_IMMUTABLE(inode)) { - error = gfs2_permission(inode, MAY_WRITE); + error = gfs2_permission(&init_user_ns, inode, MAY_WRITE); if (error) goto out; } @@ -716,10 +716,10 @@ static int gfs2_release(struct inode *inode, struct file *file) kfree(file->private_data); file->private_data = NULL; - if (file->f_mode & FMODE_WRITE) { + if (gfs2_rs_active(&ip->i_res)) gfs2_rs_delete(ip, &inode->i_writecount); + if (file->f_mode & FMODE_WRITE) gfs2_qa_put(ip); - } return 0; } @@ -1112,8 +1112,8 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t goto out_qunlock; /* check if the selected rgrp limits our max_blks further */ - if (ap.allowed && ap.allowed < max_blks) - max_blks = ap.allowed; + if (ip->i_res.rs_reserved < max_blks) + max_blks = ip->i_res.rs_reserved; /* Almost done. Calculate bytes that can be written using * max_blks. We also recompute max_bytes, data_blocks and diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d87a5bc3607b..9567520d79f7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -313,9 +313,23 @@ void gfs2_glock_put(struct gfs2_glock *gl) static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh) { const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list); - if ((gh->gh_state == LM_ST_EXCLUSIVE || - gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head) - return 0; + + if (gh != gh_head) { + /** + * Here we make a special exception to grant holders who agree + * to share the EX lock with other holders who also have the + * bit set. If the original holder has the LM_FLAG_NODE_SCOPE bit + * is set, we grant more holders with the bit set. + */ + if (gh_head->gh_state == LM_ST_EXCLUSIVE && + (gh_head->gh_flags & LM_FLAG_NODE_SCOPE) && + gh->gh_state == LM_ST_EXCLUSIVE && + (gh->gh_flags & LM_FLAG_NODE_SCOPE)) + return 1; + if ((gh->gh_state == LM_ST_EXCLUSIVE || + gh_head->gh_state == LM_ST_EXCLUSIVE)) + return 0; + } if (gl->gl_state == gh->gh_state) return 1; if (gh->gh_flags & GL_EXACT) @@ -2030,6 +2044,8 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) *p++ = 'A'; if (flags & LM_FLAG_PRIORITY) *p++ = 'p'; + if (flags & LM_FLAG_NODE_SCOPE) + *p++ = 'n'; if (flags & GL_ASYNC) *p++ = 'a'; if (flags & GL_EXACT) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 53813364517b..31a8f2f649b5 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -75,6 +75,11 @@ enum { * request and directly join the other shared lock. A shared lock request * without the priority flag might be forced to wait until the deferred * requested had acquired and released the lock. + * + * LM_FLAG_NODE_SCOPE + * This holder agrees to share the lock within this node. In other words, + * the glock is held in EX mode according to DLM, but local holders on the + * same node can share it. */ #define LM_FLAG_TRY 0x0001 @@ -82,6 +87,7 @@ enum { #define LM_FLAG_NOEXP 0x0004 #define LM_FLAG_ANY 0x0008 #define LM_FLAG_PRIORITY 0x0010 +#define LM_FLAG_NODE_SCOPE 0x0020 #define GL_ASYNC 0x0040 #define GL_EXACT 0x0080 #define GL_SKIP 0x0100 diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 3faa421568b0..8e32d569c8bf 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -86,16 +86,12 @@ static int gfs2_ail_empty_gl(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_trans tr; + unsigned int revokes; int ret; - memset(&tr, 0, sizeof(tr)); - INIT_LIST_HEAD(&tr.tr_buf); - INIT_LIST_HEAD(&tr.tr_databuf); - INIT_LIST_HEAD(&tr.tr_ail1_list); - INIT_LIST_HEAD(&tr.tr_ail2_list); - tr.tr_revokes = atomic_read(&gl->gl_ail_count); + revokes = atomic_read(&gl->gl_ail_count); - if (!tr.tr_revokes) { + if (!revokes) { bool have_revokes; bool log_in_flight; @@ -122,20 +118,14 @@ static int gfs2_ail_empty_gl(struct gfs2_glock *gl) return 0; } - /* A shortened, inline version of gfs2_trans_begin() - * tr->alloced is not set since the transaction structure is - * on the stack */ - tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes); - tr.tr_ip = _RET_IP_; - ret = gfs2_log_reserve(sdp, tr.tr_reserved); - if (ret < 0) - return ret; - WARN_ON_ONCE(current->journal_info); - current->journal_info = &tr; - - __gfs2_ail_flush(gl, 0, tr.tr_revokes); - + memset(&tr, 0, sizeof(tr)); + set_bit(TR_ONSTACK, &tr.tr_flags); + ret = __gfs2_trans_begin(&tr, sdp, 0, revokes, _RET_IP_); + if (ret) + goto flush; + __gfs2_ail_flush(gl, 0, revokes); gfs2_trans_end(sdp); + flush: gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_AIL_EMPTY_GL); @@ -146,19 +136,15 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; unsigned int revokes = atomic_read(&gl->gl_ail_count); - unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); int ret; if (!revokes) return; - while (revokes > max_revokes) - max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); - - ret = gfs2_trans_begin(sdp, 0, max_revokes); + ret = gfs2_trans_begin(sdp, 0, revokes); if (ret) return; - __gfs2_ail_flush(gl, fsync, max_revokes); + __gfs2_ail_flush(gl, fsync, revokes); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_AIL_FLUSH); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 8e1ab8ed4abc..0957119f7744 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -20,6 +20,7 @@ #include <linux/percpu.h> #include <linux/lockref.h> #include <linux/rhashtable.h> +#include <linux/mutex.h> #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -106,7 +107,8 @@ struct gfs2_rgrpd { u32 rd_data; /* num of data blocks in rgrp */ u32 rd_bitbytes; /* number of bytes in data bitmaps */ u32 rd_free; - u32 rd_reserved; /* number of blocks reserved */ + u32 rd_requested; /* number of blocks in rd_rstree */ + u32 rd_reserved; /* number of reserved blocks */ u32 rd_free_clone; u32 rd_dinodes; u64 rd_igeneration; @@ -122,34 +124,10 @@ struct gfs2_rgrpd { #define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */ #define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ spinlock_t rd_rsspin; /* protects reservation related vars */ + struct mutex rd_mutex; struct rb_root rd_rstree; /* multi-block reservation tree */ }; -struct gfs2_rbm { - struct gfs2_rgrpd *rgd; - u32 offset; /* The offset is bitmap relative */ - int bii; /* Bitmap index */ -}; - -static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm) -{ - return rbm->rgd->rd_bits + rbm->bii; -} - -static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm) -{ - BUG_ON(rbm->offset >= rbm->rgd->rd_data); - return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) + - rbm->offset; -} - -static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1, - const struct gfs2_rbm *rbm2) -{ - return (rbm1->rgd == rbm2->rgd) && (rbm1->bii == rbm2->bii) && - (rbm1->offset == rbm2->offset); -} - enum gfs2_state_bits { BH_Pinned = BH_PrivateStart, BH_Escaped = BH_PrivateStart + 1, @@ -313,9 +291,11 @@ struct gfs2_qadata { /* quota allocation data */ */ struct gfs2_blkreserv { - struct rb_node rs_node; /* link to other block reservations */ - struct gfs2_rbm rs_rbm; /* Start of reservation */ - u32 rs_free; /* how many blocks are still free */ + struct rb_node rs_node; /* node within rd_rstree */ + struct gfs2_rgrpd *rs_rgd; + u64 rs_start; + u32 rs_requested; + u32 rs_reserved; /* number of reserved blocks */ }; /* @@ -490,7 +470,7 @@ struct gfs2_quota_data { enum { TR_TOUCHED = 1, TR_ATTACHED = 2, - TR_ALLOCED = 3, + TR_ONSTACK = 3, }; struct gfs2_trans { @@ -506,7 +486,6 @@ struct gfs2_trans { unsigned int tr_num_buf_rm; unsigned int tr_num_databuf_rm; unsigned int tr_num_revoke; - unsigned int tr_num_revoke_rm; struct list_head tr_list; struct list_head tr_databuf; @@ -531,6 +510,7 @@ struct gfs2_jdesc { unsigned int nr_extents; struct work_struct jd_work; struct inode *jd_inode; + struct bio *jd_log_bio; unsigned long jd_flags; #define JDF_RECOVERY 1 unsigned int jd_jid; @@ -585,6 +565,7 @@ struct gfs2_args { unsigned int ar_errors:2; /* errors=withdraw | panic */ unsigned int ar_nobarrier:1; /* do not send barriers */ unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ + unsigned int ar_got_rgrplvb:1; /* Was the rgrplvb opt given? */ unsigned int ar_loccookie:1; /* use location based readdir cookies */ s32 ar_commit; /* Commit interval */ @@ -821,7 +802,6 @@ struct gfs2_sbd { struct gfs2_trans *sd_log_tr; unsigned int sd_log_blks_reserved; - int sd_log_committed_revoke; atomic_t sd_log_pinned; unsigned int sd_log_num_revoke; @@ -834,24 +814,22 @@ struct gfs2_sbd { atomic_t sd_log_thresh2; atomic_t sd_log_blks_free; atomic_t sd_log_blks_needed; + atomic_t sd_log_revokes_available; wait_queue_head_t sd_log_waitq; wait_queue_head_t sd_logd_waitq; u64 sd_log_sequence; - unsigned int sd_log_head; - unsigned int sd_log_tail; int sd_log_idle; struct rw_semaphore sd_log_flush_lock; atomic_t sd_log_in_flight; - struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; int sd_log_error; /* First log error */ wait_queue_head_t sd_withdraw_wait; - atomic_t sd_reserving_log; - wait_queue_head_t sd_reserving_log_wait; - + unsigned int sd_log_tail; + unsigned int sd_log_flush_tail; + unsigned int sd_log_head; unsigned int sd_log_flush_head; spinlock_t sd_ail_lock; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c1b77e8d6b1c..c9775d5c6594 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -325,7 +325,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, } if (!is_root) { - error = gfs2_permission(dir, MAY_EXEC); + error = gfs2_permission(&init_user_ns, dir, MAY_EXEC); if (error) goto out; } @@ -355,7 +355,8 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, { int error; - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&init_user_ns, &dip->i_inode, + MAY_WRITE | MAY_EXEC); if (error) return error; @@ -490,8 +491,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, di = (struct gfs2_dinode *)dibh->b_data; gfs2_dinode_out(ip, di); - di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev)); - di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev)); + di->di_major = cpu_to_be32(imajor(&ip->i_inode)); + di->di_minor = cpu_to_be32(iminor(&ip->i_inode)); di->__pad1 = 0; di->__pad2 = 0; di->__pad3 = 0; @@ -843,8 +844,8 @@ fail: * Returns: errno */ -static int gfs2_create(struct inode *dir, struct dentry *dentry, - umode_t mode, bool excl) +static int gfs2_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl); } @@ -951,7 +952,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink == 0) goto out_gunlock; - error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -1068,7 +1069,8 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, if (IS_APPEND(&dip->i_inode)) return -EPERM; - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&init_user_ns, &dip->i_inode, + MAY_WRITE | MAY_EXEC); if (error) return error; @@ -1145,7 +1147,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) if (!rgd) goto out_inodes; - gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE, ghs + 2); error = gfs2_glock_nq(ghs); /* parent */ @@ -1204,8 +1206,8 @@ out_inodes: * Returns: errno */ -static int gfs2_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int gfs2_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { unsigned int size; @@ -1225,7 +1227,8 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, * Returns: errno */ -static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int gfs2_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir)); return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0); @@ -1240,8 +1243,8 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) * */ -static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t dev) +static int gfs2_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0); } @@ -1450,8 +1453,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = -ENOENT; goto out_gunlock; } - error = gfs2_glock_nq_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, - &rd_gh); + error = gfs2_glock_nq_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &rd_gh); if (error) goto out_gunlock; } @@ -1490,7 +1493,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, } } } else { - error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&init_user_ns, ndir, + MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -1525,7 +1529,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, /* Check out the dir to be renamed */ if (dir_rename) { - error = gfs2_permission(d_inode(odentry), MAY_WRITE); + error = gfs2_permission(&init_user_ns, d_inode(odentry), + MAY_WRITE); if (error) goto out_gunlock; } @@ -1688,12 +1693,14 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry, goto out_gunlock; if (S_ISDIR(old_mode)) { - error = gfs2_permission(odentry->d_inode, MAY_WRITE); + error = gfs2_permission(&init_user_ns, odentry->d_inode, + MAY_WRITE); if (error) goto out_gunlock; } if (S_ISDIR(new_mode)) { - error = gfs2_permission(ndentry->d_inode, MAY_WRITE); + error = gfs2_permission(&init_user_ns, ndentry->d_inode, + MAY_WRITE); if (error) goto out_gunlock; } @@ -1747,9 +1754,9 @@ out: return error; } -static int gfs2_rename2(struct inode *odir, struct dentry *odentry, - struct inode *ndir, struct dentry *ndentry, - unsigned int flags) +static int gfs2_rename2(struct user_namespace *mnt_userns, struct inode *odir, + struct dentry *odentry, struct inode *ndir, + struct dentry *ndentry, unsigned int flags) { flags &= ~RENAME_NOREPLACE; @@ -1833,7 +1840,8 @@ out: * Returns: errno */ -int gfs2_permission(struct inode *inode, int mask) +int gfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { struct gfs2_inode *ip; struct gfs2_holder i_gh; @@ -1852,7 +1860,7 @@ int gfs2_permission(struct inode *inode, int mask) if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) error = -EPERM; else - error = generic_permission(inode, mask); + error = generic_permission(&init_user_ns, inode, mask); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); @@ -1861,7 +1869,7 @@ int gfs2_permission(struct inode *inode, int mask) static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr) { - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } @@ -1963,7 +1971,8 @@ out: * Returns: errno */ -static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) +static int gfs2_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); @@ -1982,7 +1991,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) goto error; @@ -1993,7 +2002,8 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) else { error = gfs2_setattr_simple(inode, attr); if (!error && attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(inode, inode->i_mode); + error = posix_acl_chmod(&init_user_ns, inode, + inode->i_mode); } error: @@ -2007,6 +2017,7 @@ out: /** * gfs2_getattr - Read out an inode's attributes + * @mnt_userns: user namespace of the mount the inode was found from * @path: Object to query * @stat: The inode's stats * @request_mask: Mask of STATX_xxx flags indicating the caller's interests @@ -2021,7 +2032,8 @@ out: * Returns: errno */ -static int gfs2_getattr(const struct path *path, struct kstat *stat, +static int gfs2_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); @@ -2049,7 +2061,7 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 8073b8d2c7fa..c447bd5b3017 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -99,7 +99,8 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip); extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); -extern int gfs2_permission(struct inode *inode, int mask); +extern int gfs2_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 9f2b5609f225..153272f82984 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -284,7 +284,6 @@ static void gdlm_put_lock(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; - int lvb_needs_unlock = 0; int error; if (gl->gl_lksb.sb_lkid == 0) { @@ -297,13 +296,10 @@ static void gdlm_put_lock(struct gfs2_glock *gl) gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_update_request_times(gl); - /* don't want to skip dlm_unlock writing the lvb when lock is ex */ - - if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE)) - lvb_needs_unlock = 1; + /* don't want to skip dlm_unlock writing the lvb when lock has one */ if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && - !lvb_needs_unlock) { + !gl->gl_lksb.sb_lvbptr) { gfs2_glock_free(gl); return; } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 2e9314091c81..16937ebb2a3e 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -50,10 +50,12 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct) unsigned int blks; unsigned int first, second; + /* The initial struct gfs2_log_descriptor block */ blks = 1; first = sdp->sd_ldptrs; if (nstruct > first) { + /* Subsequent struct gfs2_meta_header blocks */ second = sdp->sd_inptrs; blks += DIV_ROUND_UP(nstruct - first, second); } @@ -89,7 +91,7 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct writeback_control *wbc, - struct gfs2_trans *tr) + struct gfs2_trans *tr, struct blk_plug *plug) __releases(&sdp->sd_ail_lock) __acquires(&sdp->sd_ail_lock) { @@ -131,6 +133,11 @@ __acquires(&sdp->sd_ail_lock) continue; spin_unlock(&sdp->sd_ail_lock); ret = generic_writepages(mapping, wbc); + if (need_resched()) { + blk_finish_plug(plug); + cond_resched(); + blk_start_plug(plug); + } spin_lock(&sdp->sd_ail_lock); if (ret == -ENODATA) /* if a jdata write into a new hole */ ret = 0; /* ignore it */ @@ -205,7 +212,7 @@ restart: list_for_each_entry_reverse(tr, head, tr_list) { if (wbc->nr_to_write <= 0) break; - ret = gfs2_ail1_start_one(sdp, wbc, tr); + ret = gfs2_ail1_start_one(sdp, wbc, tr, &plug); if (ret) { if (ret == -EBUSY) goto restart; @@ -240,6 +247,45 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp) return gfs2_ail1_flush(sdp, &wbc); } +static void gfs2_log_update_flush_tail(struct gfs2_sbd *sdp) +{ + unsigned int new_flush_tail = sdp->sd_log_head; + struct gfs2_trans *tr; + + if (!list_empty(&sdp->sd_ail1_list)) { + tr = list_last_entry(&sdp->sd_ail1_list, + struct gfs2_trans, tr_list); + new_flush_tail = tr->tr_first; + } + sdp->sd_log_flush_tail = new_flush_tail; +} + +static void gfs2_log_update_head(struct gfs2_sbd *sdp) +{ + unsigned int new_head = sdp->sd_log_flush_head; + + if (sdp->sd_log_flush_tail == sdp->sd_log_head) + sdp->sd_log_flush_tail = new_head; + sdp->sd_log_head = new_head; +} + +/** + * gfs2_ail_empty_tr - empty one of the ail lists of a transaction + */ + +static void gfs2_ail_empty_tr(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + struct list_head *head) +{ + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_first_entry(head, struct gfs2_bufdata, + bd_ail_st_list); + gfs2_assert(sdp, bd->bd_tr == tr); + gfs2_remove_from_ail(bd); + } +} + /** * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced * @sdp: the filesystem @@ -315,6 +361,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) else oldest_tr = 0; } + gfs2_log_update_flush_tail(sdp); ret = list_empty(&sdp->sd_ail1_list); spin_unlock(&sdp->sd_ail_lock); @@ -348,47 +395,69 @@ static void gfs2_ail1_wait(struct gfs2_sbd *sdp) spin_unlock(&sdp->sd_ail_lock); } -/** - * gfs2_ail_empty_tr - empty one of the ail lists for a transaction - */ - -static void gfs2_ail_empty_tr(struct gfs2_sbd *sdp, struct gfs2_trans *tr, - struct list_head *head) +static void __ail2_empty(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct gfs2_bufdata *bd; - - while (!list_empty(head)) { - bd = list_first_entry(head, struct gfs2_bufdata, - bd_ail_st_list); - gfs2_assert(sdp, bd->bd_tr == tr); - gfs2_remove_from_ail(bd); - } + gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); + list_del(&tr->tr_list); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); + gfs2_trans_free(sdp, tr); } static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) { - struct gfs2_trans *tr, *safe; + struct list_head *ail2_list = &sdp->sd_ail2_list; unsigned int old_tail = sdp->sd_log_tail; - int wrap = (new_tail < old_tail); - int a, b, rm; + struct gfs2_trans *tr, *safe; spin_lock(&sdp->sd_ail_lock); + if (old_tail <= new_tail) { + list_for_each_entry_safe(tr, safe, ail2_list, tr_list) { + if (old_tail <= tr->tr_first && tr->tr_first < new_tail) + __ail2_empty(sdp, tr); + } + } else { + list_for_each_entry_safe(tr, safe, ail2_list, tr_list) { + if (old_tail <= tr->tr_first || tr->tr_first < new_tail) + __ail2_empty(sdp, tr); + } + } + spin_unlock(&sdp->sd_ail_lock); +} - list_for_each_entry_safe(tr, safe, &sdp->sd_ail2_list, tr_list) { - a = (old_tail <= tr->tr_first); - b = (tr->tr_first < new_tail); - rm = (wrap) ? (a || b) : (a && b); - if (!rm) - continue; +/** + * gfs2_log_is_empty - Check if the log is empty + * @sdp: The GFS2 superblock + */ - gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); - list_del(&tr->tr_list); - gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); - gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); - gfs2_trans_free(sdp, tr); +bool gfs2_log_is_empty(struct gfs2_sbd *sdp) { + return atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks; +} + +static bool __gfs2_log_try_reserve_revokes(struct gfs2_sbd *sdp, unsigned int revokes) +{ + unsigned int available; + + available = atomic_read(&sdp->sd_log_revokes_available); + while (available >= revokes) { + if (atomic_try_cmpxchg(&sdp->sd_log_revokes_available, + &available, available - revokes)) + return true; } + return false; +} - spin_unlock(&sdp->sd_ail_lock); +/** + * gfs2_log_release_revokes - Release a given number of revokes + * @sdp: The GFS2 superblock + * @revokes: The number of revokes to release + * + * sdp->sd_log_flush_lock must be held. + */ +void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes) +{ + if (revokes) + atomic_add(revokes, &sdp->sd_log_revokes_available); } /** @@ -400,86 +469,141 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) { - atomic_add(blks, &sdp->sd_log_blks_free); trace_gfs2_log_blocks(sdp, blks); gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); - up_read(&sdp->sd_log_flush_lock); + if (atomic_read(&sdp->sd_log_blks_needed)) + wake_up(&sdp->sd_log_waitq); } /** - * gfs2_log_reserve - Make a log reservation + * __gfs2_log_try_reserve - Try to make a log reservation + * @sdp: The GFS2 superblock + * @blks: The number of blocks to reserve + * @taboo_blks: The number of blocks to leave free + * + * Try to do the same as __gfs2_log_reserve(), but fail if no more log + * space is immediately available. + */ +static bool __gfs2_log_try_reserve(struct gfs2_sbd *sdp, unsigned int blks, + unsigned int taboo_blks) +{ + unsigned wanted = blks + taboo_blks; + unsigned int free_blocks; + + free_blocks = atomic_read(&sdp->sd_log_blks_free); + while (free_blocks >= wanted) { + if (atomic_try_cmpxchg(&sdp->sd_log_blks_free, &free_blocks, + free_blocks - blks)) { + trace_gfs2_log_blocks(sdp, -blks); + return true; + } + } + return false; +} + +/** + * __gfs2_log_reserve - Make a log reservation * @sdp: The GFS2 superblock * @blks: The number of blocks to reserve + * @taboo_blks: The number of blocks to leave free * - * Note that we never give out the last few blocks of the journal. Thats - * due to the fact that there is a small number of header blocks - * associated with each log flush. The exact number can't be known until - * flush time, so we ensure that we have just enough free blocks at all - * times to avoid running out during a log flush. + * @taboo_blks is set to 0 for logd, and to GFS2_LOG_FLUSH_MIN_BLOCKS + * for all other processes. This ensures that when the log is almost full, + * logd will still be able to call gfs2_log_flush one more time without + * blocking, which will advance the tail and make some more log space + * available. * * We no longer flush the log here, instead we wake up logd to do that * for us. To avoid the thundering herd and to ensure that we deal fairly * with queued waiters, we use an exclusive wait. This means that when we * get woken with enough journal space to get our reservation, we need to * wake the next waiter on the list. - * - * Returns: errno */ -int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) +static void __gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks, + unsigned int taboo_blks) { - int ret = 0; - unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize); - unsigned wanted = blks + reserved_blks; - DEFINE_WAIT(wait); - int did_wait = 0; + unsigned wanted = blks + taboo_blks; unsigned int free_blocks; - if (gfs2_assert_warn(sdp, blks) || - gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) - return -EINVAL; atomic_add(blks, &sdp->sd_log_blks_needed); -retry: - free_blocks = atomic_read(&sdp->sd_log_blks_free); - if (unlikely(free_blocks <= wanted)) { - do { - prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait, - TASK_UNINTERRUPTIBLE); + for (;;) { + if (current != sdp->sd_logd_process) wake_up(&sdp->sd_logd_waitq); - did_wait = 1; - if (atomic_read(&sdp->sd_log_blks_free) <= wanted) - io_schedule(); - free_blocks = atomic_read(&sdp->sd_log_blks_free); - } while(free_blocks <= wanted); - finish_wait(&sdp->sd_log_waitq, &wait); - } - atomic_inc(&sdp->sd_reserving_log); - if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks, - free_blocks - blks) != free_blocks) { - if (atomic_dec_and_test(&sdp->sd_reserving_log)) - wake_up(&sdp->sd_reserving_log_wait); - goto retry; + io_wait_event(sdp->sd_log_waitq, + (free_blocks = atomic_read(&sdp->sd_log_blks_free), + free_blocks >= wanted)); + do { + if (atomic_try_cmpxchg(&sdp->sd_log_blks_free, + &free_blocks, + free_blocks - blks)) + goto reserved; + } while (free_blocks >= wanted); } - atomic_sub(blks, &sdp->sd_log_blks_needed); - trace_gfs2_log_blocks(sdp, -blks); - /* - * If we waited, then so might others, wake them up _after_ we get - * our share of the log. - */ - if (unlikely(did_wait)) +reserved: + trace_gfs2_log_blocks(sdp, -blks); + if (atomic_sub_return(blks, &sdp->sd_log_blks_needed)) wake_up(&sdp->sd_log_waitq); +} + +/** + * gfs2_log_try_reserve - Try to make a log reservation + * @sdp: The GFS2 superblock + * @tr: The transaction + * @extra_revokes: The number of additional revokes reserved (output) + * + * This is similar to gfs2_log_reserve, but sdp->sd_log_flush_lock must be + * held for correct revoke accounting. + */ - down_read(&sdp->sd_log_flush_lock); - if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { - gfs2_log_release(sdp, blks); - ret = -EROFS; +bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes) +{ + unsigned int blks = tr->tr_reserved; + unsigned int revokes = tr->tr_revokes; + unsigned int revoke_blks = 0; + + *extra_revokes = 0; + if (revokes && !__gfs2_log_try_reserve_revokes(sdp, revokes)) { + revoke_blks = DIV_ROUND_UP(revokes, sdp->sd_inptrs); + *extra_revokes = revoke_blks * sdp->sd_inptrs - revokes; + blks += revoke_blks; } - if (atomic_dec_and_test(&sdp->sd_reserving_log)) - wake_up(&sdp->sd_reserving_log_wait); - return ret; + if (!blks) + return true; + if (__gfs2_log_try_reserve(sdp, blks, GFS2_LOG_FLUSH_MIN_BLOCKS)) + return true; + if (!revoke_blks) + gfs2_log_release_revokes(sdp, revokes); + return false; +} + +/** + * gfs2_log_reserve - Make a log reservation + * @sdp: The GFS2 superblock + * @tr: The transaction + * @extra_revokes: The number of additional revokes reserved (output) + * + * sdp->sd_log_flush_lock must not be held. + */ + +void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes) +{ + unsigned int blks = tr->tr_reserved; + unsigned int revokes = tr->tr_revokes; + unsigned int revoke_blks = 0; + + *extra_revokes = 0; + if (revokes) { + revoke_blks = DIV_ROUND_UP(revokes, sdp->sd_inptrs); + *extra_revokes = revoke_blks * sdp->sd_inptrs - revokes; + blks += revoke_blks; + } + __gfs2_log_reserve(sdp, blks, GFS2_LOG_FLUSH_MIN_BLOCKS); } /** @@ -507,24 +631,20 @@ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer } /** - * calc_reserved - Calculate the number of blocks to reserve when - * refunding a transaction's unused buffers. + * calc_reserved - Calculate the number of blocks to keep reserved * @sdp: The GFS2 superblock * * This is complex. We need to reserve room for all our currently used - * metadata buffers (e.g. normal file I/O rewriting file time stamps) and - * all our journaled data buffers for journaled files (e.g. files in the + * metadata blocks (e.g. normal file I/O rewriting file time stamps) and + * all our journaled data blocks for journaled files (e.g. files in the * meta_fs like rindex, or files for which chattr +j was done.) - * If we don't reserve enough space, gfs2_log_refund and gfs2_log_flush - * will count it as free space (sd_log_blks_free) and corruption will follow. + * If we don't reserve enough space, corruption will follow. * - * We can have metadata bufs and jdata bufs in the same journal. So each - * type gets its own log header, for which we need to reserve a block. - * In fact, each type has the potential for needing more than one header - * in cases where we have more buffers than will fit on a journal page. + * We can have metadata blocks and jdata blocks in the same journal. Each + * type gets its own log descriptor, for which we need to reserve a block. + * In fact, each type has the potential for needing more than one log descriptor + * in cases where we have more blocks than will fit in a log descriptor. * Metadata journal entries take up half the space of journaled buffer entries. - * Thus, metadata entries have buf_limit (502) and journaled buffers have - * databuf_limit (251) before they cause a wrap around. * * Also, we need to reserve blocks for revoke journal entries and one for an * overall header for the lot. @@ -533,59 +653,29 @@ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer */ static unsigned int calc_reserved(struct gfs2_sbd *sdp) { - unsigned int reserved = 0; - unsigned int mbuf; - unsigned int dbuf; + unsigned int reserved = GFS2_LOG_FLUSH_MIN_BLOCKS; + unsigned int blocks; struct gfs2_trans *tr = sdp->sd_log_tr; if (tr) { - mbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm; - dbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; - reserved = mbuf + dbuf; - /* Account for header blocks */ - reserved += DIV_ROUND_UP(mbuf, buf_limit(sdp)); - reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp)); + blocks = tr->tr_num_buf_new - tr->tr_num_buf_rm; + reserved += blocks + DIV_ROUND_UP(blocks, buf_limit(sdp)); + blocks = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; + reserved += blocks + DIV_ROUND_UP(blocks, databuf_limit(sdp)); } - - if (sdp->sd_log_committed_revoke > 0) - reserved += gfs2_struct2blk(sdp, sdp->sd_log_committed_revoke); - /* One for the overall header */ - if (reserved) - reserved++; return reserved; } -static unsigned int current_tail(struct gfs2_sbd *sdp) -{ - struct gfs2_trans *tr; - unsigned int tail; - - spin_lock(&sdp->sd_ail_lock); - - if (list_empty(&sdp->sd_ail1_list)) { - tail = sdp->sd_log_head; - } else { - tr = list_last_entry(&sdp->sd_ail1_list, struct gfs2_trans, - tr_list); - tail = tr->tr_first; - } - - spin_unlock(&sdp->sd_ail_lock); - - return tail; -} - -static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) +static void log_pull_tail(struct gfs2_sbd *sdp) { - unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail); + unsigned int new_tail = sdp->sd_log_flush_tail; + unsigned int dist; + if (new_tail == sdp->sd_log_tail) + return; + dist = log_distance(sdp, new_tail, sdp->sd_log_tail); ail2_empty(sdp, new_tail); - - atomic_add(dist, &sdp->sd_log_blks_free); - trace_gfs2_log_blocks(sdp, dist); - gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= - sdp->sd_jdesc->jd_blocks); - + gfs2_log_release(sdp, dist); sdp->sd_log_tail = new_tail; } @@ -698,7 +788,7 @@ void gfs2_glock_remove_revoke(struct gfs2_glock *gl) } /** - * gfs2_write_revokes - Add as many revokes to the system transaction as we can + * gfs2_flush_revokes - Add as many revokes to the system transaction as we can * @sdp: The GFS2 superblock * * Our usual strategy is to defer writing revokes as much as we can in the hope @@ -709,38 +799,14 @@ void gfs2_glock_remove_revoke(struct gfs2_glock *gl) * been written back. This will basically come at no cost now, and will save * us from having to keep track of those blocks on the AIL2 list later. */ -void gfs2_write_revokes(struct gfs2_sbd *sdp) +void gfs2_flush_revokes(struct gfs2_sbd *sdp) { /* number of revokes we still have room for */ - int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); + unsigned int max_revokes = atomic_read(&sdp->sd_log_revokes_available); gfs2_log_lock(sdp); - while (sdp->sd_log_num_revoke > max_revokes) - max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); - max_revokes -= sdp->sd_log_num_revoke; - if (!sdp->sd_log_num_revoke) { - atomic_dec(&sdp->sd_log_blks_free); - /* If no blocks have been reserved, we need to also - * reserve a block for the header */ - if (!sdp->sd_log_blks_reserved) { - atomic_dec(&sdp->sd_log_blks_free); - trace_gfs2_log_blocks(sdp, -2); - } else { - trace_gfs2_log_blocks(sdp, -1); - } - } gfs2_ail1_empty(sdp, max_revokes); gfs2_log_unlock(sdp); - - if (!sdp->sd_log_num_revoke) { - atomic_inc(&sdp->sd_log_blks_free); - if (!sdp->sd_log_blks_reserved) { - atomic_inc(&sdp->sd_log_blks_free); - trace_gfs2_log_blocks(sdp, 2); - } else { - trace_gfs2_log_blocks(sdp, 1); - } - } } /** @@ -769,7 +835,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, u64 dblock; if (gfs2_withdrawn(sdp)) - goto out; + return; page = mempool_alloc(gfs2_page_pool, GFP_NOIO); lh = page_address(page); @@ -822,10 +888,8 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, sb->s_blocksize - LH_V1_SIZE - 4); lh->lh_crc = cpu_to_be32(crc); - gfs2_log_write(sdp, page, sb->s_blocksize, 0, dblock); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); -out: - log_flush_wait(sdp); + gfs2_log_write(sdp, jd, page, sb->s_blocksize, 0, dblock); + gfs2_log_submit_bio(&jd->jd_log_bio, REQ_OP_WRITE | op_flags); } /** @@ -838,25 +902,24 @@ out: static void log_write_header(struct gfs2_sbd *sdp, u32 flags) { - unsigned int tail; int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); gfs2_assert_withdraw(sdp, (state != SFS_FROZEN)); - tail = current_tail(sdp); if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { gfs2_ordered_wait(sdp); log_flush_wait(sdp); op_flags = REQ_SYNC | REQ_META | REQ_PRIO; } - sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); - gfs2_write_log_header(sdp, sdp->sd_jdesc, sdp->sd_log_sequence++, tail, - sdp->sd_log_flush_head, flags, op_flags); + sdp->sd_log_idle = (sdp->sd_log_flush_tail == sdp->sd_log_flush_head); + gfs2_write_log_header(sdp, sdp->sd_jdesc, sdp->sd_log_sequence++, + sdp->sd_log_flush_tail, sdp->sd_log_flush_head, + flags, op_flags); gfs2_log_incr_head(sdp); - - if (sdp->sd_log_tail != tail) - log_pull_tail(sdp, tail); + log_flush_wait(sdp); + log_pull_tail(sdp); + gfs2_log_update_head(sdp); } /** @@ -956,10 +1019,15 @@ static void trans_drain(struct gfs2_trans *tr) void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) { struct gfs2_trans *tr = NULL; + unsigned int reserved_blocks = 0, used_blocks = 0; enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); + unsigned int first_log_head; + unsigned int reserved_revokes = 0; down_write(&sdp->sd_log_flush_lock); + trace_gfs2_log_flush(sdp, 1, flags); +repeat: /* * Do this check while holding the log_flush_lock to prevent new * buffers from being added to the ail via gfs2_pin() @@ -970,28 +1038,47 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) /* Log might have been flushed while we waited for the flush lock */ if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) goto out; - trace_gfs2_log_flush(sdp, 1, flags); - if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN) - clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + first_log_head = sdp->sd_log_head; + sdp->sd_log_flush_head = first_log_head; - sdp->sd_log_flush_head = sdp->sd_log_head; tr = sdp->sd_log_tr; - if (tr) { - sdp->sd_log_tr = NULL; - tr->tr_first = sdp->sd_log_flush_head; - if (unlikely (state == SFS_FROZEN)) - if (gfs2_assert_withdraw_delayed(sdp, - !tr->tr_num_buf_new && !tr->tr_num_databuf_new)) - goto out_withdraw; + if (tr || sdp->sd_log_num_revoke) { + if (reserved_blocks) + gfs2_log_release(sdp, reserved_blocks); + reserved_blocks = sdp->sd_log_blks_reserved; + reserved_revokes = sdp->sd_log_num_revoke; + if (tr) { + sdp->sd_log_tr = NULL; + tr->tr_first = first_log_head; + if (unlikely (state == SFS_FROZEN)) { + if (gfs2_assert_withdraw_delayed(sdp, + !tr->tr_num_buf_new && !tr->tr_num_databuf_new)) + goto out_withdraw; + } + } + } else if (!reserved_blocks) { + unsigned int taboo_blocks = GFS2_LOG_FLUSH_MIN_BLOCKS; + + reserved_blocks = GFS2_LOG_FLUSH_MIN_BLOCKS; + if (current == sdp->sd_logd_process) + taboo_blocks = 0; + + if (!__gfs2_log_try_reserve(sdp, reserved_blocks, taboo_blocks)) { + up_write(&sdp->sd_log_flush_lock); + __gfs2_log_reserve(sdp, reserved_blocks, taboo_blocks); + down_write(&sdp->sd_log_flush_lock); + goto repeat; + } + BUG_ON(sdp->sd_log_num_revoke); } + if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN) + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + if (unlikely(state == SFS_FROZEN)) - if (gfs2_assert_withdraw_delayed(sdp, !sdp->sd_log_num_revoke)) + if (gfs2_assert_withdraw_delayed(sdp, !reserved_revokes)) goto out_withdraw; - if (gfs2_assert_withdraw_delayed(sdp, - sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke)) - goto out_withdraw; gfs2_ordered_write(sdp); if (gfs2_withdrawn(sdp)) @@ -999,16 +1086,13 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) lops_before_commit(sdp, tr); if (gfs2_withdrawn(sdp)) goto out_withdraw; - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); + gfs2_log_submit_bio(&sdp->sd_jdesc->jd_log_bio, REQ_OP_WRITE); if (gfs2_withdrawn(sdp)) goto out_withdraw; if (sdp->sd_log_head != sdp->sd_log_flush_head) { - log_flush_wait(sdp); log_write_header(sdp, flags); - } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ - atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ - trace_gfs2_log_blocks(sdp, -1); + } else if (sdp->sd_log_tail != sdp->sd_log_flush_tail && !sdp->sd_log_idle) { log_write_header(sdp, flags); } if (gfs2_withdrawn(sdp)) @@ -1016,9 +1100,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) lops_after_commit(sdp, tr); gfs2_log_lock(sdp); - sdp->sd_log_head = sdp->sd_log_flush_head; sdp->sd_log_blks_reserved = 0; - sdp->sd_log_committed_revoke = 0; spin_lock(&sdp->sd_ail_lock); if (tr && !list_empty(&tr->tr_ail1_list)) { @@ -1033,10 +1115,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) empty_ail1_list(sdp); if (gfs2_withdrawn(sdp)) goto out_withdraw; - atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ - trace_gfs2_log_blocks(sdp, -1); log_write_header(sdp, flags); - sdp->sd_log_head = sdp->sd_log_flush_head; } if (flags & (GFS2_LOG_HEAD_FLUSH_SHUTDOWN | GFS2_LOG_HEAD_FLUSH_FREEZE)) @@ -1046,12 +1125,22 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) } out_end: - trace_gfs2_log_flush(sdp, 0, flags); + used_blocks = log_distance(sdp, sdp->sd_log_flush_head, first_log_head); + reserved_revokes += atomic_read(&sdp->sd_log_revokes_available); + atomic_set(&sdp->sd_log_revokes_available, sdp->sd_ldptrs); + gfs2_assert_withdraw(sdp, reserved_revokes % sdp->sd_inptrs == sdp->sd_ldptrs); + if (reserved_revokes > sdp->sd_ldptrs) + reserved_blocks += (reserved_revokes - sdp->sd_ldptrs) / sdp->sd_inptrs; out: + if (used_blocks != reserved_blocks) { + gfs2_assert_withdraw_delayed(sdp, used_blocks < reserved_blocks); + gfs2_log_release(sdp, reserved_blocks - used_blocks); + } up_write(&sdp->sd_log_flush_lock); gfs2_trans_free(sdp, tr); if (gfs2_withdrawing(sdp)) gfs2_withdraw(sdp); + trace_gfs2_log_flush(sdp, 0, flags); return; out_withdraw: @@ -1087,8 +1176,8 @@ static void gfs2_merge_trans(struct gfs2_sbd *sdp, struct gfs2_trans *new) old->tr_num_databuf_new += new->tr_num_databuf_new; old->tr_num_buf_rm += new->tr_num_buf_rm; old->tr_num_databuf_rm += new->tr_num_databuf_rm; + old->tr_revokes += new->tr_revokes; old->tr_num_revoke += new->tr_num_revoke; - old->tr_num_revoke_rm += new->tr_num_revoke_rm; list_splice_tail_init(&new->tr_databuf, &old->tr_databuf); list_splice_tail_init(&new->tr_buf, &old->tr_buf); @@ -1110,20 +1199,17 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) if (sdp->sd_log_tr) { gfs2_merge_trans(sdp, tr); } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) { - gfs2_assert_withdraw(sdp, test_bit(TR_ALLOCED, &tr->tr_flags)); + gfs2_assert_withdraw(sdp, !test_bit(TR_ONSTACK, &tr->tr_flags)); sdp->sd_log_tr = tr; set_bit(TR_ATTACHED, &tr->tr_flags); } - sdp->sd_log_committed_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; reserved = calc_reserved(sdp); maxres = sdp->sd_log_blks_reserved + tr->tr_reserved; gfs2_assert_withdraw(sdp, maxres >= reserved); unused = maxres - reserved; - atomic_add(unused, &sdp->sd_log_blks_free); - trace_gfs2_log_blocks(sdp, unused); - gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= - sdp->sd_jdesc->jd_blocks); + if (unused) + gfs2_log_release(sdp, unused); sdp->sd_log_blks_reserved = reserved; gfs2_log_unlock(sdp); @@ -1166,15 +1252,11 @@ static void gfs2_log_shutdown(struct gfs2_sbd *sdp) gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); - sdp->sd_log_flush_head = sdp->sd_log_head; - log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT | GFS2_LFC_SHUTDOWN); + log_pull_tail(sdp); gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); - - sdp->sd_log_head = sdp->sd_log_flush_head; - sdp->sd_log_tail = sdp->sd_log_head; } static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) @@ -1208,7 +1290,6 @@ int gfs2_logd(void *data) struct gfs2_sbd *sdp = data; unsigned long t = 1; DEFINE_WAIT(wait); - bool did_flush; while (!kthread_should_stop()) { @@ -1227,12 +1308,10 @@ int gfs2_logd(void *data) continue; } - did_flush = false; if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { gfs2_ail1_empty(sdp, 0); gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | - GFS2_LFC_LOGD_JFLUSH_REQD); - did_flush = true; + GFS2_LFC_LOGD_JFLUSH_REQD); } if (gfs2_ail_flush_reqd(sdp)) { @@ -1240,13 +1319,9 @@ int gfs2_logd(void *data) gfs2_ail1_wait(sdp); gfs2_ail1_empty(sdp, 0); gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | - GFS2_LFC_LOGD_AIL_FLUSH_REQD); - did_flush = true; + GFS2_LFC_LOGD_AIL_FLUSH_REQD); } - if (!gfs2_ail_flush_reqd(sdp) || did_flush) - wake_up(&sdp->sd_log_waitq); - t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; try_to_freeze(); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 79f97290146e..eea58015710e 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -13,6 +13,13 @@ #include "incore.h" #include "inode.h" +/* + * The minimum amount of log space required for a log flush is one block for + * revokes and one block for the log header. Log flushes other than + * GFS2_LOG_HEAD_FLUSH_NORMAL may write one or two more log headers. + */ +#define GFS2_LOG_FLUSH_MIN_BLOCKS 4 + /** * gfs2_log_lock - acquire the right to mess with the log manager * @sdp: the filesystem @@ -43,7 +50,9 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, if (++value == sdp->sd_jdesc->jd_blocks) { value = 0; } - sdp->sd_log_head = sdp->sd_log_tail = value; + sdp->sd_log_tail = value; + sdp->sd_log_flush_tail = value; + sdp->sd_log_head = value; } static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) @@ -64,8 +73,13 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct); extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); +extern bool gfs2_log_is_empty(struct gfs2_sbd *sdp); +extern void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes); extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); -extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); +extern bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes); +extern void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes); extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, u64 seq, u32 tail, u32 lblock, u32 flags, int op_flags); @@ -78,6 +92,6 @@ extern void log_flush_wait(struct gfs2_sbd *sdp); extern int gfs2_logd(void *data); extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl); -extern void gfs2_write_revokes(struct gfs2_sbd *sdp); +extern void gfs2_flush_revokes(struct gfs2_sbd *sdp); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 3922b26264f5..dc1b93a877c6 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -76,15 +76,20 @@ static void maybe_release_space(struct gfs2_bufdata *bd) unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; struct gfs2_bitmap *bi = rgd->rd_bits + index; + rgrp_lock_local(rgd); if (bi->bi_clone == NULL) - return; + goto out; if (sdp->sd_args.ar_discard) gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL); memcpy(bi->bi_clone + bi->bi_offset, bd->bd_bh->b_data + bi->bi_offset, bi->bi_bytes); clear_bit(GBF_FULL, &bi->bi_flags); rgd->rd_free_clone = rgd->rd_free; + BUG_ON(rgd->rd_free_clone < rgd->rd_reserved); rgd->rd_extfail_pt = rgd->rd_free; + +out: + rgrp_unlock_local(rgd); } /** @@ -322,17 +327,18 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, * then add the page segment to that. */ -void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, - unsigned size, unsigned offset, u64 blkno) +void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + struct page *page, unsigned size, unsigned offset, + u64 blkno) { struct bio *bio; int ret; - bio = gfs2_log_get_bio(sdp, blkno, &sdp->sd_log_bio, REQ_OP_WRITE, + bio = gfs2_log_get_bio(sdp, blkno, &jd->jd_log_bio, REQ_OP_WRITE, gfs2_end_log_write, false); ret = bio_add_page(bio, page, size, offset); if (ret == 0) { - bio = gfs2_log_get_bio(sdp, blkno, &sdp->sd_log_bio, + bio = gfs2_log_get_bio(sdp, blkno, &jd->jd_log_bio, REQ_OP_WRITE, gfs2_end_log_write, true); ret = bio_add_page(bio, page, size, offset); WARN_ON(ret == 0); @@ -355,7 +361,8 @@ static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); gfs2_log_incr_head(sdp); - gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh), dblock); + gfs2_log_write(sdp, sdp->sd_jdesc, bh->b_page, bh->b_size, + bh_offset(bh), dblock); } /** @@ -369,14 +376,14 @@ static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) * the page may be freed at any time. */ -void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) +static void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) { struct super_block *sb = sdp->sd_vfs; u64 dblock; dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); gfs2_log_incr_head(sdp); - gfs2_log_write(sdp, page, sb->s_blocksize, 0, dblock); + gfs2_log_write(sdp, sdp->sd_jdesc, page, sb->s_blocksize, 0, dblock); } /** @@ -845,7 +852,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) struct page *page; unsigned int length; - gfs2_write_revokes(sdp); + gfs2_flush_revokes(sdp); if (!sdp->sd_log_num_revoke) return; @@ -857,7 +864,6 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) sdp->sd_log_num_revoke--; if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { - gfs2_log_write_page(sdp, page); page = mempool_alloc(gfs2_page_pool, GFP_NOIO); mh = page_address(page); diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index fbdbb08dcec6..31b6dd0d2e5d 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -10,37 +10,24 @@ #include <linux/list.h> #include "incore.h" -#define BUF_OFFSET \ - ((sizeof(struct gfs2_log_descriptor) + sizeof(__be64) - 1) & \ - ~(sizeof(__be64) - 1)) -#define DATABUF_OFFSET \ - ((sizeof(struct gfs2_log_descriptor) + (2 * sizeof(__be64) - 1)) & \ - ~(2 * sizeof(__be64) - 1)) - extern const struct gfs2_log_operations *gfs2_log_ops[]; extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); extern u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn); -extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, - unsigned size, unsigned offset, u64 blkno); -extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); +extern void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + struct page *page, unsigned size, unsigned offset, + u64 blkno); extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bool keep_cache); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { - unsigned int limit; - - limit = (sdp->sd_sb.sb_bsize - BUF_OFFSET) / sizeof(__be64); - return limit; + return sdp->sd_ldptrs; } static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) { - unsigned int limit; - - limit = (sdp->sd_sb.sb_bsize - DATABUF_OFFSET) / (2 * sizeof(__be64)); - return limit; + return sdp->sd_ldptrs / 2; } static inline void lops_before_commit(struct gfs2_sbd *sdp, diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index c7393ee9cf68..28d0eb23e18e 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -98,7 +98,7 @@ static int __init init_gfs2_fs(void) error = -ENOMEM; gfs2_glock_cachep = kmem_cache_create("gfs2_glock", sizeof(struct gfs2_glock), - 0, 0, + 0, SLAB_RECLAIM_ACCOUNT, gfs2_init_glock_once); if (!gfs2_glock_cachep) goto fail_cachep1; @@ -134,7 +134,7 @@ static int __init init_gfs2_fs(void) gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad", sizeof(struct gfs2_quota_data), - 0, 0, NULL); + 0, SLAB_RECLAIM_ACCOUNT, NULL); if (!gfs2_quotad_cachep) goto fail_cachep6; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 61fce59cb4d3..74c7d01723b9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -136,8 +136,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_rwsem(&sdp->sd_log_flush_lock); atomic_set(&sdp->sd_log_in_flight, 0); - atomic_set(&sdp->sd_reserving_log, 0); - init_waitqueue_head(&sdp->sd_reserving_log_wait); init_waitqueue_head(&sdp->sd_log_flush_wait); atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN); mutex_init(&sdp->sd_freeze_mutex); @@ -171,7 +169,8 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) return -EINVAL; } - if (sb->sb_fs_format != GFS2_FORMAT_FS || + if (sb->sb_fs_format < GFS2_FS_FORMAT_MIN || + sb->sb_fs_format > GFS2_FS_FORMAT_MAX || sb->sb_multihost_format != GFS2_FORMAT_MULTI) { fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); return -EINVAL; @@ -179,7 +178,7 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) if (sb->sb_bsize < 512 || sb->sb_bsize > PAGE_SIZE || (sb->sb_bsize & (sb->sb_bsize - 1))) { - pr_warn("Invalid superblock size\n"); + pr_warn("Invalid block size\n"); return -EINVAL; } @@ -317,6 +316,13 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sizeof(struct gfs2_meta_header)) * GFS2_NBBY; /* not the rgrp bitmap, subsequent bitmaps only */ + /* + * We always keep at least one block reserved for revokes in + * transactions. This greatly simplifies allocating additional + * revoke blocks. + */ + atomic_set(&sdp->sd_log_revokes_available, sdp->sd_ldptrs); + /* Compute maximum reservation required to add a entry to a directory */ hash_blocks = DIV_ROUND_UP(sizeof(u64) * BIT(GFS2_DIR_MAX_DEPTH), @@ -488,6 +494,19 @@ static int init_sb(struct gfs2_sbd *sdp, int silent) goto out; } + switch(sdp->sd_sb.sb_fs_format) { + case GFS2_FS_FORMAT_MAX: + sb->s_xattr = gfs2_xattr_handlers_max; + break; + + case GFS2_FS_FORMAT_MIN: + sb->s_xattr = gfs2_xattr_handlers_min; + break; + + default: + BUG(); + } + /* Set up the buffer cache and SB for real */ if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) { ret = -EINVAL; @@ -1032,13 +1051,14 @@ hostdata_error: } if (lm->lm_mount == NULL) { - fs_info(sdp, "Now mounting FS...\n"); + fs_info(sdp, "Now mounting FS (format %u)...\n", sdp->sd_sb.sb_fs_format); complete_all(&sdp->sd_locking_init); return 0; } ret = lm->lm_mount(sdp, table); if (ret == 0) - fs_info(sdp, "Joined cluster. Now mounting FS...\n"); + fs_info(sdp, "Joined cluster. Now mounting FS (format %u)...\n", + sdp->sd_sb.sb_fs_format); complete_all(&sdp->sd_locking_init); return ret; } @@ -1084,6 +1104,7 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) int silent = fc->sb_flags & SB_SILENT; struct gfs2_sbd *sdp; struct gfs2_holder mount_gh; + struct gfs2_holder freeze_gh; int error; sdp = init_sbd(sb); @@ -1107,7 +1128,6 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &gfs2_super_ops; sb->s_d_op = &gfs2_dops; sb->s_export_op = &gfs2_export_ops; - sb->s_xattr = gfs2_xattr_handlers; sb->s_qcop = &gfs2_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; @@ -1156,6 +1176,10 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) if (error) goto fail_locking; + /* Turn rgrplvb on by default if fs format is recent enough */ + if (!sdp->sd_args.ar_got_rgrplvb && sdp->sd_sb.sb_fs_format > 1801) + sdp->sd_args.ar_rgrplvb = 1; + error = wait_on_journal(sdp); if (error) goto fail_sb; @@ -1195,25 +1219,18 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) goto fail_per_node; } - if (sb_rdonly(sb)) { - struct gfs2_holder freeze_gh; + error = gfs2_freeze_lock(sdp, &freeze_gh, 0); + if (error) + goto fail_per_node; - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT, - &freeze_gh); - if (error) { - fs_err(sdp, "can't make FS RO: %d\n", error); - goto fail_per_node; - } - gfs2_glock_dq_uninit(&freeze_gh); - } else { + if (!sb_rdonly(sb)) error = gfs2_make_fs_rw(sdp); - if (error) { - fs_err(sdp, "can't make FS RW: %d\n", error); - goto fail_per_node; - } - } + gfs2_freeze_unlock(&freeze_gh); + if (error) { + fs_err(sdp, "can't make FS RW: %d\n", error); + goto fail_per_node; + } gfs2_glock_dq_uninit(&mount_gh); gfs2_online_uevent(sdp); return 0; @@ -1456,6 +1473,7 @@ static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_rgrplvb: args->ar_rgrplvb = result.boolean; + args->ar_got_rgrplvb = 1; break; case Opt_loccookie: args->ar_loccookie = result.boolean; @@ -1514,6 +1532,12 @@ static int gfs2_reconfigure(struct fs_context *fc) fc->sb_flags |= SB_RDONLY; if ((sb->s_flags ^ fc->sb_flags) & SB_RDONLY) { + struct gfs2_holder freeze_gh; + + error = gfs2_freeze_lock(sdp, &freeze_gh, 0); + if (error) + return -EINVAL; + if (fc->sb_flags & SB_RDONLY) { error = gfs2_make_fs_ro(sdp); if (error) @@ -1523,6 +1547,7 @@ static int gfs2_reconfigure(struct fs_context *fc) if (error) errorfc(fc, "unable to remount read-write"); } + gfs2_freeze_unlock(&freeze_gh); } sdp->sd_args = *newargs; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index c26c68ebd29d..282173774005 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -470,9 +470,7 @@ void gfs2_recover_func(struct work_struct *work) /* Acquire a shared hold on the freeze lock */ - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | LM_FLAG_PRIORITY | - GL_EXACT, &thaw_gh); + error = gfs2_freeze_lock(sdp, &thaw_gh, LM_FLAG_PRIORITY); if (error) goto fail_gunlock_ji; @@ -507,22 +505,24 @@ void gfs2_recover_func(struct work_struct *work) /* We take the sd_log_flush_lock here primarily to prevent log * flushes and simultaneous journal replays from stomping on - * each other wrt sd_log_bio. */ + * each other wrt jd_log_bio. */ down_read(&sdp->sd_log_flush_lock); for (pass = 0; pass < 2; pass++) { lops_before_scan(jd, &head, pass); error = foreach_descriptor(jd, head.lh_tail, head.lh_blkno, pass); lops_after_scan(jd, error, pass); - if (error) + if (error) { + up_read(&sdp->sd_log_flush_lock); goto fail_gunlock_thaw; + } } recover_local_statfs(jd, &head); clean_journal(jd, &head); up_read(&sdp->sd_log_flush_lock); - gfs2_glock_dq_uninit(&thaw_gh); + gfs2_freeze_unlock(&thaw_gh); t_rep = ktime_get(); fs_info(sdp, "jid=%u: Journal replayed in %lldms [jlck:%lldms, " "jhead:%lldms, tlck:%lldms, replay:%lldms]\n", @@ -544,7 +544,7 @@ void gfs2_recover_func(struct work_struct *work) goto done; fail_gunlock_thaw: - gfs2_glock_dq_uninit(&thaw_gh); + gfs2_freeze_unlock(&thaw_gh); fail_gunlock_ji: if (jlocked) { gfs2_glock_dq_uninit(&ji_gh); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 5e8eef9990e3..89c37a845e64 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -36,6 +36,24 @@ #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) +struct gfs2_rbm { + struct gfs2_rgrpd *rgd; + u32 offset; /* The offset is bitmap relative */ + int bii; /* Bitmap index */ +}; + +static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm) +{ + return rbm->rgd->rd_bits + rbm->bii; +} + +static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm) +{ + BUG_ON(rbm->offset >= rbm->rgd->rd_data); + return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) + + rbm->offset; +} + /* * These routines are used by the resource group routines (rgrp.c) * to keep track of block allocation. Each block is represented by two @@ -61,7 +79,7 @@ static const char valid_change[16] = { }; static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, - const struct gfs2_inode *ip, bool nowrap); + struct gfs2_blkreserv *rs, bool nowrap); /** @@ -175,7 +193,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state) /** * rs_cmp - multi-block reservation range compare - * @blk: absolute file system block number of the new reservation + * @start: start of the new reservation * @len: number of blocks in the new reservation * @rs: existing reservation to compare against * @@ -183,13 +201,11 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state) * -1 if the block range is before the start of the reservation * 0 if the block range overlaps with the reservation */ -static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs) +static inline int rs_cmp(u64 start, u32 len, struct gfs2_blkreserv *rs) { - u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm); - - if (blk >= startblk + rs->rs_free) + if (start >= rs->rs_start + rs->rs_requested) return 1; - if (blk + len - 1 < startblk) + if (rs->rs_start >= start + len) return -1; return 0; } @@ -277,29 +293,38 @@ static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) } /** - * gfs2_rbm_incr - increment an rbm structure + * gfs2_rbm_add - add a number of blocks to an rbm * @rbm: The rbm with rgd already set correctly + * @blocks: The number of blocks to add to rpm * - * This function takes an existing rbm structure and increments it to the next - * viable block offset. - * - * Returns: If incrementing the offset would cause the rbm to go past the - * end of the rgrp, true is returned, otherwise false. + * This function takes an existing rbm structure and adds a number of blocks to + * it. * + * Returns: True if the new rbm would point past the end of the rgrp. */ -static bool gfs2_rbm_incr(struct gfs2_rbm *rbm) +static bool gfs2_rbm_add(struct gfs2_rbm *rbm, u32 blocks) { - if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */ - rbm->offset++; + struct gfs2_rgrpd *rgd = rbm->rgd; + struct gfs2_bitmap *bi = rgd->rd_bits + rbm->bii; + + if (rbm->offset + blocks < bi->bi_blocks) { + rbm->offset += blocks; return false; } - if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */ - return true; + blocks -= bi->bi_blocks - rbm->offset; - rbm->offset = 0; - rbm->bii++; - return false; + for(;;) { + bi++; + if (bi == rgd->rd_bits + rgd->rd_length) + return true; + if (blocks < bi->bi_blocks) { + rbm->offset = blocks; + rbm->bii = bi - rgd->rd_bits; + return false; + } + blocks -= bi->bi_blocks; + } } /** @@ -308,7 +333,8 @@ static bool gfs2_rbm_incr(struct gfs2_rbm *rbm) * @n_unaligned: Number of unaligned blocks to check * @len: Decremented for each block found (terminate on zero) * - * Returns: true if a non-free block is encountered + * Returns: true if a non-free block is encountered or the end of the resource + * group is reached. */ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len) @@ -323,7 +349,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *le (*len)--; if (*len == 0) return true; - if (gfs2_rbm_incr(rbm)) + if (gfs2_rbm_add(rbm, 1)) return true; } @@ -595,10 +621,11 @@ static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs, { struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res); - gfs2_print_dbg(seq, "%s B: n:%llu s:%llu b:%u f:%u\n", fs_id_buf, + gfs2_print_dbg(seq, "%s B: n:%llu s:%llu f:%u\n", + fs_id_buf, (unsigned long long)ip->i_no_addr, - (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm), - rs->rs_rbm.offset, rs->rs_free); + (unsigned long long)rs->rs_start, + rs->rs_requested); } /** @@ -613,33 +640,22 @@ static void __rs_deltree(struct gfs2_blkreserv *rs) if (!gfs2_rs_active(rs)) return; - rgd = rs->rs_rbm.rgd; + rgd = rs->rs_rgd; trace_gfs2_rs(rs, TRACE_RS_TREEDEL); rb_erase(&rs->rs_node, &rgd->rd_rstree); RB_CLEAR_NODE(&rs->rs_node); - if (rs->rs_free) { - u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) + - rs->rs_free - 1; - struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, }; - struct gfs2_bitmap *start, *last; + if (rs->rs_requested) { + /* return requested blocks to the rgrp */ + BUG_ON(rs->rs_rgd->rd_requested < rs->rs_requested); + rs->rs_rgd->rd_requested -= rs->rs_requested; - /* return reserved blocks to the rgrp */ - BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); - rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; /* The rgrp extent failure point is likely not to increase; it will only do so if the freed blocks are somehow contiguous with a span of free blocks that follows. Still, it will force the number to be recalculated later. */ - rgd->rd_extfail_pt += rs->rs_free; - rs->rs_free = 0; - if (gfs2_rbm_from_block(&last_rbm, last_block)) - return; - start = rbm_bi(&rs->rs_rbm); - last = rbm_bi(&last_rbm); - do - clear_bit(GBF_FULL, &start->bi_flags); - while (start++ != last); + rgd->rd_extfail_pt += rs->rs_requested; + rs->rs_requested = 0; } } @@ -652,11 +668,11 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) { struct gfs2_rgrpd *rgd; - rgd = rs->rs_rbm.rgd; + rgd = rs->rs_rgd; if (rgd) { spin_lock(&rgd->rd_rsspin); __rs_deltree(rs); - BUG_ON(rs->rs_free); + BUG_ON(rs->rs_requested); spin_unlock(&rgd->rd_rsspin); } } @@ -904,6 +920,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) rgd->rd_data = be32_to_cpu(buf.ri_data); rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); spin_lock_init(&rgd->rd_rsspin); + mutex_init(&rgd->rd_mutex); error = compute_bitstructs(rgd); if (error) @@ -1149,6 +1166,23 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd) return count; } +static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd) +{ + struct gfs2_bitmap *bi; + int x; + + if (rgd->rd_free) { + for (x = 0; x < rgd->rd_length; x++) { + bi = rgd->rd_bits + x; + clear_bit(GBF_FULL, &bi->bi_flags); + } + } else { + for (x = 0; x < rgd->rd_length; x++) { + bi = rgd->rd_bits + x; + set_bit(GBF_FULL, &bi->bi_flags); + } + } +} /** * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps @@ -1192,11 +1226,11 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) } if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { - for (x = 0; x < length; x++) - clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); + rgrp_set_bitmap_flags(rgd); rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); rgd->rd_free_clone = rgd->rd_free; + BUG_ON(rgd->rd_reserved); /* max out the rgrp allocation failure point */ rgd->rd_extfail_pt = rgd->rd_free; } @@ -1244,7 +1278,11 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd) if (rgd->rd_rgl->rl_unlinked == 0) rgd->rd_flags &= ~GFS2_RDF_CHECK; rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free); + rgrp_set_bitmap_flags(rgd); rgd->rd_free_clone = rgd->rd_free; + BUG_ON(rgd->rd_reserved); + /* max out the rgrp allocation failure point */ + rgd->rd_extfail_pt = rgd->rd_free; rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes); rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration); return 0; @@ -1404,7 +1442,8 @@ int gfs2_fitrim(struct file *filp, void __user *argp) while (1) { - ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &gh); if (ret) goto out; @@ -1412,9 +1451,11 @@ int gfs2_fitrim(struct file *filp, void __user *argp) /* Trim each bitmap in the rgrp */ for (x = 0; x < rgd->rd_length; x++) { struct gfs2_bitmap *bi = rgd->rd_bits + x; + rgrp_lock_local(rgd); ret = gfs2_rgrp_send_discards(sdp, rgd->rd_data0, NULL, bi, minlen, &amt); + rgrp_unlock_local(rgd); if (ret) { gfs2_glock_dq_uninit(&gh); goto out; @@ -1426,9 +1467,11 @@ int gfs2_fitrim(struct file *filp, void __user *argp) ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0); if (ret == 0) { bh = rgd->rd_bits[0].bi_bh; + rgrp_lock_local(rgd); rgd->rd_flags |= GFS2_RGF_TRIMMED; gfs2_trans_add_meta(rgd->rd_gl, bh); gfs2_rgrp_out(rgd, bh->b_data); + rgrp_unlock_local(rgd); gfs2_trans_end(sdp); } } @@ -1458,8 +1501,7 @@ static void rs_insert(struct gfs2_inode *ip) struct rb_node **newn, *parent = NULL; int rc; struct gfs2_blkreserv *rs = &ip->i_res; - struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd; - u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm); + struct gfs2_rgrpd *rgd = rs->rs_rgd; BUG_ON(gfs2_rs_active(rs)); @@ -1470,7 +1512,7 @@ static void rs_insert(struct gfs2_inode *ip) rb_entry(*newn, struct gfs2_blkreserv, rs_node); parent = *newn; - rc = rs_cmp(fsblock, rs->rs_free, cur); + rc = rs_cmp(rs->rs_start, rs->rs_requested, cur); if (rc > 0) newn = &((*newn)->rb_right); else if (rc < 0) @@ -1486,7 +1528,7 @@ static void rs_insert(struct gfs2_inode *ip) rb_insert_color(&rs->rs_node, &rgd->rd_rstree); /* Do our rgrp accounting for the reservation */ - rgd->rd_reserved += rs->rs_free; /* blocks reserved */ + rgd->rd_requested += rs->rs_requested; /* blocks requested */ spin_unlock(&rgd->rd_rsspin); trace_gfs2_rs(rs, TRACE_RS_INSERT); } @@ -1507,9 +1549,9 @@ static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs) { u32 tot_reserved, tot_free; - if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free)) + if (WARN_ON_ONCE(rgd->rd_requested < rs->rs_requested)) return 0; - tot_reserved = rgd->rd_reserved - rs->rs_free; + tot_reserved = rgd->rd_requested - rs->rs_requested; if (rgd->rd_free_clone < tot_reserved) tot_reserved = 0; @@ -1534,17 +1576,26 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, u64 goal; struct gfs2_blkreserv *rs = &ip->i_res; u32 extlen; - u32 free_blocks = rgd_free(rgd, rs); + u32 free_blocks, blocks_available; int ret; struct inode *inode = &ip->i_inode; + spin_lock(&rgd->rd_rsspin); + free_blocks = rgd_free(rgd, rs); + if (rgd->rd_free_clone < rgd->rd_requested) + free_blocks = 0; + blocks_available = rgd->rd_free_clone - rgd->rd_reserved; + if (rgd == rs->rs_rgd) + blocks_available += rs->rs_reserved; + spin_unlock(&rgd->rd_rsspin); + if (S_ISDIR(inode->i_mode)) extlen = 1; else { extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target); extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks); } - if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen)) + if (free_blocks < extlen || blocks_available < extlen) return; /* Find bitmap block that contains bits for goal block */ @@ -1556,10 +1607,10 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) return; - ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true); + ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, &ip->i_res, true); if (ret == 0) { - rs->rs_rbm = rbm; - rs->rs_free = extlen; + rs->rs_start = gfs2_rbm_to_block(&rbm); + rs->rs_requested = extlen; rs_insert(ip); } else { if (goal == rgd->rd_last_alloc + rgd->rd_data0) @@ -1572,7 +1623,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, * @rgd: The resource group * @block: The starting block * @length: The required length - * @ip: Ignore any reservations for this inode + * @ignore_rs: Reservation to ignore * * If the block does not appear in any reservation, then return the * block number unchanged. If it does appear in the reservation, then @@ -1582,7 +1633,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, u32 length, - const struct gfs2_inode *ip) + struct gfs2_blkreserv *ignore_rs) { struct gfs2_blkreserv *rs; struct rb_node *n; @@ -1602,8 +1653,8 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, } if (n) { - while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) { - block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free; + while (rs_cmp(block, length, rs) == 0 && rs != ignore_rs) { + block = rs->rs_start + rs->rs_requested; n = n->rb_right; if (n == NULL) break; @@ -1618,7 +1669,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, /** * gfs2_reservation_check_and_update - Check for reservations during block alloc * @rbm: The current position in the resource group - * @ip: The inode for which we are searching for blocks + * @rs: Our own reservation * @minext: The minimum extent length * @maxext: A pointer to the maximum extent structure * @@ -1632,20 +1683,19 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, */ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, - const struct gfs2_inode *ip, + struct gfs2_blkreserv *rs, u32 minext, struct gfs2_extent *maxext) { u64 block = gfs2_rbm_to_block(rbm); u32 extlen = 1; u64 nblock; - int ret; /* * If we have a minimum extent length, then skip over any extent * which is less than the min extent length in size. */ - if (minext) { + if (minext > 1) { extlen = gfs2_free_extlen(rbm, minext); if (extlen <= maxext->len) goto fail; @@ -1655,7 +1705,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, * Check the extent which has been found against the reservations * and skip if parts of it are already reserved */ - nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); + nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, rs); if (nblock == block) { if (!minext || extlen >= minext) return 0; @@ -1664,12 +1714,15 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, maxext->len = extlen; maxext->rbm = *rbm; } -fail: - nblock = block + extlen; + } else { + u64 len = nblock - block; + if (len >= (u64)1 << 32) + return -E2BIG; + extlen = len; } - ret = gfs2_rbm_from_block(rbm, nblock); - if (ret < 0) - return ret; +fail: + if (gfs2_rbm_add(rbm, extlen)) + return -E2BIG; return 1; } @@ -1677,9 +1730,9 @@ fail: * gfs2_rbm_find - Look for blocks of a particular state * @rbm: Value/result starting position and final position * @state: The state which we want to find - * @minext: Pointer to the requested extent length (NULL for a single block) + * @minext: Pointer to the requested extent length * This is updated to be the actual reservation size. - * @ip: If set, check for reservations + * @rs: Our own reservation (NULL to skip checking for reservations) * @nowrap: Stop looking at the end of the rgrp, rather than wrapping * around until we've reached the starting point. * @@ -1693,7 +1746,7 @@ fail: */ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, - const struct gfs2_inode *ip, bool nowrap) + struct gfs2_blkreserv *rs, bool nowrap) { bool scan_from_start = rbm->bii == 0 && rbm->offset == 0; struct buffer_head *bh; @@ -1714,8 +1767,7 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, while(1) { bi = rbm_bi(rbm); - if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) && - test_bit(GBF_FULL, &bi->bi_flags) && + if (test_bit(GBF_FULL, &bi->bi_flags) && (state == GFS2_BLKST_FREE)) goto next_bitmap; @@ -1731,11 +1783,10 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, goto next_bitmap; } rbm->offset = offset; - if (ip == NULL) + if (!rs) return 0; - ret = gfs2_reservation_check_and_update(rbm, ip, - minext ? *minext : 0, + ret = gfs2_reservation_check_and_update(rbm, rs, *minext, &maxext); if (ret == 0) return 0; @@ -1767,7 +1818,7 @@ next_iter: break; } - if (minext == NULL || state != GFS2_BLKST_FREE) + if (state != GFS2_BLKST_FREE) return -ENOSPC; /* If the extent was too small, and it's smaller than the smallest @@ -1775,7 +1826,7 @@ next_iter: useless to search this rgrp again for this amount or more. */ if (wrapped && (scan_from_start || rbm->bii > last_bii) && *minext < rbm->rgd->rd_extfail_pt) - rbm->rgd->rd_extfail_pt = *minext; + rbm->rgd->rd_extfail_pt = *minext - 1; /* If the maximum extent we found is big enough to fulfill the minimum requirements, use it anyway. */ @@ -1938,7 +1989,7 @@ static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs, u64 tdiff; tdiff = ktime_to_ns(ktime_sub(ktime_get_real(), - rs->rs_rbm.rgd->rd_gl->gl_dstamp)); + rs->rs_rgd->rd_gl->gl_dstamp)); return tdiff > (msecs * 1000 * 1000); } @@ -1993,8 +2044,7 @@ static inline int fast_to_acquire(struct gfs2_rgrpd *rgd) * We try our best to find an rgrp that has at least ap->target blocks * available. After a couple of passes (loops == 2), the prospects of finding * such an rgrp diminish. At this stage, we return the first rgrp that has - * at least ap->min_target blocks available. Either way, we set ap->allowed to - * the number of blocks available in the chosen rgrp. + * at least ap->min_target blocks available. * * Returns: 0 on success, * -ENOMEM if a suitable rgrp can't be found @@ -2006,56 +2056,64 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *begin = NULL; struct gfs2_blkreserv *rs = &ip->i_res; - int error = 0, rg_locked, flags = 0; + int error = 0, flags = LM_FLAG_NODE_SCOPE; + bool rg_locked; u64 last_unlinked = NO_BLOCK; + u32 target = ap->target; int loops = 0; - u32 free_blocks, skip = 0; + u32 free_blocks, blocks_available, skip = 0; + + BUG_ON(rs->rs_reserved); if (sdp->sd_args.ar_rgrplvb) flags |= GL_SKIP; - if (gfs2_assert_warn(sdp, ap->target)) + if (gfs2_assert_warn(sdp, target)) return -EINVAL; if (gfs2_rs_active(rs)) { - begin = rs->rs_rbm.rgd; - } else if (rs->rs_rbm.rgd && - rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) { - begin = rs->rs_rbm.rgd; + begin = rs->rs_rgd; + } else if (rs->rs_rgd && + rgrp_contains_block(rs->rs_rgd, ip->i_goal)) { + begin = rs->rs_rgd; } else { check_and_update_goal(ip); - rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); + rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); } if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV)) skip = gfs2_orlov_skip(ip); - if (rs->rs_rbm.rgd == NULL) + if (rs->rs_rgd == NULL) return -EBADSLT; while (loops < 3) { - rg_locked = 1; + struct gfs2_rgrpd *rgd; - if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) { - rg_locked = 0; + rg_locked = gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl); + if (rg_locked) { + rgrp_lock_local(rs->rs_rgd); + } else { if (skip && skip--) goto next_rgrp; if (!gfs2_rs_active(rs)) { if (loops == 0 && - !fast_to_acquire(rs->rs_rbm.rgd)) + !fast_to_acquire(rs->rs_rgd)) goto next_rgrp; if ((loops < 2) && gfs2_rgrp_used_recently(rs, 1000) && - gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) + gfs2_rgrp_congested(rs->rs_rgd, loops)) goto next_rgrp; } - error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, + error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl, LM_ST_EXCLUSIVE, flags, &ip->i_rgd_gh); if (unlikely(error)) return error; + rgrp_lock_local(rs->rs_rgd); if (!gfs2_rs_active(rs) && (loops < 2) && - gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) + gfs2_rgrp_congested(rs->rs_rgd, loops)) goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) { - error = update_rgrp_lvb(rs->rs_rbm.rgd); + error = update_rgrp_lvb(rs->rs_rgd); if (unlikely(error)) { + rgrp_unlock_local(rs->rs_rgd); gfs2_glock_dq_uninit(&ip->i_rgd_gh); return error; } @@ -2063,36 +2121,46 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) } /* Skip unusable resource groups */ - if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | + if ((rs->rs_rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) || - (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt)) + (loops == 0 && target > rs->rs_rgd->rd_extfail_pt)) goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) - gfs2_rgrp_bh_get(rs->rs_rbm.rgd); + gfs2_rgrp_bh_get(rs->rs_rgd); /* Get a reservation if we don't already have one */ if (!gfs2_rs_active(rs)) - rg_mblk_search(rs->rs_rbm.rgd, ip, ap); + rg_mblk_search(rs->rs_rgd, ip, ap); /* Skip rgrps when we can't get a reservation on first pass */ if (!gfs2_rs_active(rs) && (loops < 1)) goto check_rgrp; /* If rgrp has enough free space, use it */ - free_blocks = rgd_free(rs->rs_rbm.rgd, rs); - if (free_blocks >= ap->target || - (loops == 2 && ap->min_target && - free_blocks >= ap->min_target)) { - ap->allowed = free_blocks; - return 0; + rgd = rs->rs_rgd; + spin_lock(&rgd->rd_rsspin); + free_blocks = rgd_free(rgd, rs); + blocks_available = rgd->rd_free_clone - rgd->rd_reserved; + if (free_blocks < target || blocks_available < target) { + spin_unlock(&rgd->rd_rsspin); + goto check_rgrp; } + rs->rs_reserved = ap->target; + if (rs->rs_reserved > blocks_available) + rs->rs_reserved = blocks_available; + rgd->rd_reserved += rs->rs_reserved; + spin_unlock(&rgd->rd_rsspin); + rgrp_unlock_local(rs->rs_rgd); + return 0; check_rgrp: /* Check for unlinked inodes which can be reclaimed */ - if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) - try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, + if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rs->rs_rgd, &last_unlinked, ip->i_no_addr); skip_rgrp: + rgrp_unlock_local(rs->rs_rgd); + /* Drop reservation, if we couldn't use reserved rgrp */ if (gfs2_rs_active(rs)) gfs2_rs_deltree(rs); @@ -2102,7 +2170,7 @@ skip_rgrp: gfs2_glock_dq_uninit(&ip->i_rgd_gh); next_rgrp: /* Find the next rgrp, and continue looking */ - if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin)) + if (gfs2_select_rgrp(&rs->rs_rgd, begin)) continue; if (skip) continue; @@ -2119,9 +2187,12 @@ next_rgrp: return error; } /* Flushing the log may release space */ - if (loops == 2) + if (loops == 2) { + if (ap->min_target) + target = ap->min_target; gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_INPLACE_RESERVE); + } } return -ENOSPC; @@ -2136,6 +2207,17 @@ next_rgrp: void gfs2_inplace_release(struct gfs2_inode *ip) { + struct gfs2_blkreserv *rs = &ip->i_res; + + if (rs->rs_reserved) { + struct gfs2_rgrpd *rgd = rs->rs_rgd; + + spin_lock(&rgd->rd_rsspin); + BUG_ON(rgd->rd_reserved < rs->rs_reserved); + rgd->rd_reserved -= rs->rs_reserved; + spin_unlock(&rgd->rd_rsspin); + rs->rs_reserved = 0; + } if (gfs2_holder_initialized(&ip->i_rgd_gh)) gfs2_glock_dq_uninit(&ip->i_rgd_gh); } @@ -2205,7 +2287,7 @@ static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd, bi_prev = bi; } gfs2_setbit(&rbm, false, new_state); - gfs2_rbm_incr(&rbm); + gfs2_rbm_add(&rbm, 1); } } @@ -2223,11 +2305,12 @@ void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *trs; const struct rb_node *n; - gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n", + spin_lock(&rgd->rd_rsspin); + gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u q:%u r:%u e:%u\n", fs_id_buf, (unsigned long long)rgd->rd_addr, rgd->rd_flags, rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, - rgd->rd_reserved, rgd->rd_extfail_pt); + rgd->rd_requested, rgd->rd_reserved, rgd->rd_extfail_pt); if (rgd->rd_sbd->sd_args.ar_rgrplvb) { struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; @@ -2236,7 +2319,6 @@ void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, be32_to_cpu(rgl->rl_free), be32_to_cpu(rgl->rl_dinodes)); } - spin_lock(&rgd->rd_rsspin); for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { trs = rb_entry(n, struct gfs2_blkreserv, rs_node); dump_rs(seq, trs, fs_id_buf); @@ -2273,29 +2355,29 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip, { struct gfs2_blkreserv *rs = &ip->i_res; struct gfs2_rgrpd *rgd = rbm->rgd; - unsigned rlen; - u64 block; - int ret; - spin_lock(&rgd->rd_rsspin); + BUG_ON(rs->rs_reserved < len); + rs->rs_reserved -= len; if (gfs2_rs_active(rs)) { - if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) { - block = gfs2_rbm_to_block(rbm); - ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len); - rlen = min(rs->rs_free, len); - rs->rs_free -= rlen; - rgd->rd_reserved -= rlen; + u64 start = gfs2_rbm_to_block(rbm); + + if (rs->rs_start == start) { + unsigned int rlen; + + rs->rs_start += len; + rlen = min(rs->rs_requested, len); + rs->rs_requested -= rlen; + rgd->rd_requested -= rlen; trace_gfs2_rs(rs, TRACE_RS_CLAIM); - if (rs->rs_free && !ret) - goto out; + if (rs->rs_start < rgd->rd_data0 + rgd->rd_data && + rs->rs_requested) + return; /* We used up our block reservation, so we should reserve more blocks next time. */ atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint); } __rs_deltree(rs); } -out: - spin_unlock(&rgd->rd_rsspin); } /** @@ -2315,15 +2397,13 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm, u64 goal; if (gfs2_rs_active(&ip->i_res)) { - *rbm = ip->i_res.rs_rbm; - return; + goal = ip->i_res.rs_start; + } else { + if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal)) + goal = ip->i_goal; + else + goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0; } - - if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal)) - goal = ip->i_goal; - else - goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0; - if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) { rbm->bii = 0; rbm->offset = 0; @@ -2346,17 +2426,21 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; - struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, }; - unsigned int ndata; + struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rgd, }; u64 block; /* block, within the file system scope */ - int error; + u32 minext = 1; + int error = -ENOSPC; - gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false); + BUG_ON(ip->i_res.rs_reserved < *nblocks); + rgrp_lock_local(rbm.rgd); + if (gfs2_rs_active(&ip->i_res)) { + gfs2_set_alloc_start(&rbm, ip, dinode); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, &ip->i_res, false); + } if (error == -ENOSPC) { gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, NULL, false); } /* Since all blocks are reserved in advance, this shouldn't happen */ @@ -2371,14 +2455,8 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_alloc_extent(&rbm, dinode, nblocks); block = gfs2_rbm_to_block(&rbm); rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0; - if (gfs2_rs_active(&ip->i_res)) - gfs2_adjust_reservation(ip, &rbm, *nblocks); - ndata = *nblocks; - if (dinode) - ndata--; - if (!dinode) { - ip->i_goal = block + ndata - 1; + ip->i_goal = block + *nblocks - 1; error = gfs2_meta_inode_buffer(ip, &dibh); if (error == 0) { struct gfs2_dinode *di = @@ -2389,12 +2467,20 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, brelse(dibh); } } - if (rbm.rgd->rd_free < *nblocks) { + spin_lock(&rbm.rgd->rd_rsspin); + gfs2_adjust_reservation(ip, &rbm, *nblocks); + if (rbm.rgd->rd_free < *nblocks || rbm.rgd->rd_reserved < *nblocks) { fs_warn(sdp, "nblocks=%u\n", *nblocks); + spin_unlock(&rbm.rgd->rd_rsspin); goto rgrp_error; } - + BUG_ON(rbm.rgd->rd_reserved < *nblocks); + BUG_ON(rbm.rgd->rd_free_clone < *nblocks); + BUG_ON(rbm.rgd->rd_free < *nblocks); + rbm.rgd->rd_reserved -= *nblocks; + rbm.rgd->rd_free_clone -= *nblocks; rbm.rgd->rd_free -= *nblocks; + spin_unlock(&rbm.rgd->rd_rsspin); if (dinode) { rbm.rgd->rd_dinodes++; *generation = rbm.rgd->rd_igeneration++; @@ -2404,6 +2490,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data); + rgrp_unlock_local(rbm.rgd); gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); if (dinode) @@ -2411,13 +2498,13 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); - rbm.rgd->rd_free_clone -= *nblocks; trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); *bn = block; return 0; rgrp_error: + rgrp_unlock_local(rbm.rgd); gfs2_rgrp_error(rbm.rgd); return -EIO; } @@ -2437,12 +2524,14 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + rgrp_lock_local(rgd); rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE); trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE); rgd->rd_free += blen; rgd->rd_flags &= ~GFS2_RGF_TRIMMED; gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + rgrp_unlock_local(rgd); /* Directories keep their data in the metadata address space */ if (meta || ip->i_depth || gfs2_is_jdata(ip)) @@ -2478,17 +2567,20 @@ void gfs2_unlink_di(struct inode *inode) rgd = gfs2_blk2rgrpd(sdp, blkno, true); if (!rgd) return; + rgrp_lock_local(rgd); rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED); trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1); + rgrp_unlock_local(rgd); } void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) { struct gfs2_sbd *sdp = rgd->rd_sbd; + rgrp_lock_local(rgd); rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE); if (!rgd->rd_dinodes) gfs2_consist_rgrpd(rgd); @@ -2497,6 +2589,7 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + rgrp_unlock_local(rgd); be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1); gfs2_statfs_change(sdp, 0, +1, -1); @@ -2511,6 +2604,10 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) * @no_addr: The block number to check * @type: The block type we are looking for * + * The inode glock of @no_addr must be held. The @type to check for is either + * GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; checking for type GFS2_BLKST_FREE + * or GFS2_BLKST_USED would make no sense. + * * Returns: 0 if the block type matches the expected type * -ESTALE if it doesn't match * or -ve errno if something went wrong while checking @@ -2534,6 +2631,13 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) rbm.rgd = rgd; error = gfs2_rbm_from_block(&rbm, no_addr); if (!WARN_ON_ONCE(error)) { + /* + * No need to take the local resource group lock here; the + * inode glock of @no_addr provides the necessary + * synchronization in case the block is an inode. (In case + * the block is not an inode, the block type will not match + * the @type we are looking for.) + */ if (gfs2_testbit(&rbm, false) != type) error = -ESTALE; } @@ -2578,7 +2682,7 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, return; rgd = gfs2_blk2rgrpd(sdp, block, 1); } else { - rgd = ip->i_res.rs_rbm.rgd; + rgd = ip->i_res.rs_rgd; if (!rgd || !rgrp_contains_block(rgd, block)) rgd = gfs2_blk2rgrpd(sdp, block, 1); } @@ -2633,9 +2737,8 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist) sizeof(struct gfs2_holder), GFP_NOFS | __GFP_NOFAIL); for (x = 0; x < rlist->rl_rgrps; x++) - gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, - LM_ST_EXCLUSIVE, 0, - &rlist->rl_ghs[x]); + gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &rlist->rl_ghs[x]); } /** @@ -2658,3 +2761,14 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist) } } +void rgrp_lock_local(struct gfs2_rgrpd *rgd) +{ + BUG_ON(!gfs2_glock_is_held_excl(rgd->rd_gl) && + !test_bit(SDF_NORECOVERY, &rgd->rd_sbd->sd_flags)); + mutex_lock(&rgd->rd_mutex); +} + +void rgrp_unlock_local(struct gfs2_rgrpd *rgd) +{ + mutex_unlock(&rgd->rd_mutex); +} diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 9a587ada51ed..a6855fd796e0 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -77,7 +77,7 @@ extern int gfs2_fitrim(struct file *filp, void __user *argp); /* This is how to tell if a reservation is in the rgrp tree: */ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs) { - return rs && !RB_EMPTY_NODE(&rs->rs_node); + return !RB_EMPTY_NODE(&rs->rs_node); } static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) @@ -88,4 +88,8 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) } extern void check_and_update_goal(struct gfs2_inode *ip); + +extern void rgrp_lock_local(struct gfs2_rgrpd *rgd); +extern void rgrp_unlock_local(struct gfs2_rgrpd *rgd); + #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 042b94288ff1..861ed5fe02a5 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -81,19 +81,12 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp) static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid) { struct gfs2_jdesc *jd; - int found = 0; list_for_each_entry(jd, head, jd_list) { - if (jd->jd_jid == jid) { - found = 1; - break; - } + if (jd->jd_jid == jid) + return jd; } - - if (!found) - jd = NULL; - - return jd; + return NULL; } struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid) @@ -165,7 +158,6 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); struct gfs2_glock *j_gl = ip->i_gl; - struct gfs2_holder freeze_gh; struct gfs2_log_header_host head; int error; @@ -173,12 +165,6 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) if (error) return error; - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT, - &freeze_gh); - if (error) - goto fail_threads; - j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); if (gfs2_withdrawn(sdp)) { error = -EIO; @@ -205,13 +191,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - gfs2_glock_dq_uninit(&freeze_gh); - return 0; fail: - gfs2_glock_dq_uninit(&freeze_gh); -fail_threads: if (sdp->sd_quotad_process) kthread_stop(sdp->sd_quotad_process); sdp->sd_quotad_process = NULL; @@ -452,7 +434,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) } if (error) - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + gfs2_freeze_unlock(&sdp->sd_freeze_gh); out: while (!list_empty(&list)) { @@ -607,30 +589,9 @@ out: int gfs2_make_fs_ro(struct gfs2_sbd *sdp) { - struct gfs2_holder freeze_gh; int error = 0; int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - gfs2_holder_mark_uninitialized(&freeze_gh); - if (sdp->sd_freeze_gl && - !gfs2_glock_is_locked_by_me(sdp->sd_freeze_gl)) { - if (!log_write_allowed) { - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, - LM_ST_SHARED, LM_FLAG_TRY | - LM_FLAG_NOEXP | GL_EXACT, - &freeze_gh); - if (error == GLR_TRYFAILED) - error = 0; - } else { - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, - LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT, - &freeze_gh); - if (error && !gfs2_withdrawn(sdp)) - return error; - } - } - gfs2_flush_delete_work(sdp); if (!log_write_allowed && current == sdp->sd_quotad_process) fs_warn(sdp, "The quotad daemon is withdrawing.\n"); @@ -650,18 +611,15 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN | GFS2_LFC_MAKE_FS_RO); - wait_event(sdp->sd_reserving_log_wait, - atomic_read(&sdp->sd_reserving_log) == 0); - gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == - sdp->sd_jdesc->jd_blocks); + wait_event_timeout(sdp->sd_log_waitq, + gfs2_log_is_empty(sdp), + HZ * 5); + gfs2_assert_warn(sdp, gfs2_log_is_empty(sdp)); } else { - wait_event_timeout(sdp->sd_reserving_log_wait, - atomic_read(&sdp->sd_reserving_log) == 0, + wait_event_timeout(sdp->sd_log_waitq, + gfs2_log_is_empty(sdp), HZ * 5); } - if (gfs2_holder_initialized(&freeze_gh)) - gfs2_glock_dq_uninit(&freeze_gh); - gfs2_quota_cleanup(sdp); if (!log_write_allowed) @@ -770,10 +728,8 @@ void gfs2_freeze_func(struct work_struct *work) struct super_block *sb = sdp->sd_vfs; atomic_inc(&sb->s_active); - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT, &freeze_gh); + error = gfs2_freeze_lock(sdp, &freeze_gh, 0); if (error) { - fs_info(sdp, "GFS2: couldn't get freeze lock : %d\n", error); gfs2_assert_withdraw(sdp, 0); } else { atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN); @@ -783,7 +739,7 @@ void gfs2_freeze_func(struct work_struct *work) error); gfs2_assert_withdraw(sdp, 0); } - gfs2_glock_dq_uninit(&freeze_gh); + gfs2_freeze_unlock(&freeze_gh); } deactivate_super(sb); clear_bit_unlock(SDF_FS_FROZEN, &sdp->sd_flags); @@ -851,7 +807,7 @@ static int gfs2_unfreeze(struct super_block *sb) return 0; } - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + gfs2_freeze_unlock(&sdp->sd_freeze_gh); mutex_unlock(&sdp->sd_freeze_mutex); return wait_on_bit(&sdp->sd_flags, SDF_FS_FROZEN, TASK_INTERRUPTIBLE); } @@ -1227,7 +1183,8 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) goto out_qs; } - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &gh); if (error) goto out_qs; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index c9fb2a654181..08e502dec7ec 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -11,6 +11,10 @@ #include <linux/dcache.h> #include "incore.h" +/* Supported fs format version range */ +#define GFS2_FS_FORMAT_MIN (1801) +#define GFS2_FS_FORMAT_MAX (1802) + extern void gfs2_lm_unmount(struct gfs2_sbd *sdp); static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) @@ -54,7 +58,9 @@ extern struct file_system_type gfs2meta_fs_type; extern const struct export_operations gfs2_export_ops; extern const struct super_operations gfs2_super_ops; extern const struct dentry_operations gfs2_dops; -extern const struct xattr_handler *gfs2_xattr_handlers[]; + +extern const struct xattr_handler *gfs2_xattr_handlers_max[]; +extern const struct xattr_handler **gfs2_xattr_handlers_min; #endif /* __SUPER_DOT_H__ */ diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 0b2f858d9a8c..bd6c8e9e49db 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -560,6 +560,7 @@ TRACE_EVENT(gfs2_block_alloc, __field( u8, block_state ) __field( u64, rd_addr ) __field( u32, rd_free_clone ) + __field( u32, rd_requested ) __field( u32, rd_reserved ) ), @@ -571,17 +572,20 @@ TRACE_EVENT(gfs2_block_alloc, __entry->block_state = block_state; __entry->rd_addr = rgd->rd_addr; __entry->rd_free_clone = rgd->rd_free_clone; + __entry->rd_requested = rgd->rd_requested; __entry->rd_reserved = rgd->rd_reserved; ), - TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu", + TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rq:%u rr:%u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->inum, (unsigned long long)__entry->start, (unsigned long)__entry->len, block_state_name(__entry->block_state), (unsigned long long)__entry->rd_addr, - __entry->rd_free_clone, (unsigned long)__entry->rd_reserved) + __entry->rd_free_clone, + __entry->rd_requested, + __entry->rd_reserved) ); /* Keep track of multi-block reservations as they are allocated/freed */ @@ -595,33 +599,40 @@ TRACE_EVENT(gfs2_rs, __field( dev_t, dev ) __field( u64, rd_addr ) __field( u32, rd_free_clone ) + __field( u32, rd_requested ) __field( u32, rd_reserved ) __field( u64, inum ) __field( u64, start ) - __field( u32, free ) + __field( u32, requested ) + __field( u32, reserved ) __field( u8, func ) ), TP_fast_assign( - __entry->dev = rs->rs_rbm.rgd->rd_sbd->sd_vfs->s_dev; - __entry->rd_addr = rs->rs_rbm.rgd->rd_addr; - __entry->rd_free_clone = rs->rs_rbm.rgd->rd_free_clone; - __entry->rd_reserved = rs->rs_rbm.rgd->rd_reserved; + __entry->dev = rs->rs_rgd->rd_sbd->sd_vfs->s_dev; + __entry->rd_addr = rs->rs_rgd->rd_addr; + __entry->rd_free_clone = rs->rs_rgd->rd_free_clone; + __entry->rd_requested = rs->rs_rgd->rd_requested; + __entry->rd_reserved = rs->rs_rgd->rd_reserved; __entry->inum = container_of(rs, struct gfs2_inode, i_res)->i_no_addr; - __entry->start = gfs2_rbm_to_block(&rs->rs_rbm); - __entry->free = rs->rs_free; + __entry->start = rs->rs_start; + __entry->requested = rs->rs_requested; + __entry->reserved = rs->rs_reserved; __entry->func = func; ), - TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s f:%lu", + TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%u rq:%u rr:%u %s q:%u r:%u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->inum, (unsigned long long)__entry->start, (unsigned long long)__entry->rd_addr, - (unsigned long)__entry->rd_free_clone, - (unsigned long)__entry->rd_reserved, - rs_func_name(__entry->func), (unsigned long)__entry->free) + __entry->rd_free_clone, + __entry->rd_requested, + __entry->rd_reserved, + rs_func_name(__entry->func), + __entry->requested, + __entry->reserved) ); #endif /* _TRACE_GFS2_H */ diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 6d4bf7ea7b3b..ab96cf0bf26b 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -31,17 +31,17 @@ static void gfs2_print_trans(struct gfs2_sbd *sdp, const struct gfs2_trans *tr) fs_warn(sdp, "blocks=%u revokes=%u reserved=%u touched=%u\n", tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, test_bit(TR_TOUCHED, &tr->tr_flags)); - fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u/%u\n", + fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u\n", tr->tr_num_buf_new, tr->tr_num_buf_rm, tr->tr_num_databuf_new, tr->tr_num_databuf_rm, - tr->tr_num_revoke, tr->tr_num_revoke_rm); + tr->tr_num_revoke); } -int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, - unsigned int revokes) +int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp, + unsigned int blocks, unsigned int revokes, + unsigned long ip) { - struct gfs2_trans *tr; - int error; + unsigned int extra_revokes; if (current->journal_info) { gfs2_print_trans(sdp, current->journal_info); @@ -52,39 +52,72 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) return -EROFS; - tr = kmem_cache_zalloc(gfs2_trans_cachep, GFP_NOFS); - if (!tr) - return -ENOMEM; - - tr->tr_ip = _RET_IP_; + tr->tr_ip = ip; tr->tr_blocks = blocks; tr->tr_revokes = revokes; - tr->tr_reserved = 1; - set_bit(TR_ALLOCED, &tr->tr_flags); - if (blocks) - tr->tr_reserved += 6 + blocks; - if (revokes) - tr->tr_reserved += gfs2_struct2blk(sdp, revokes); + tr->tr_reserved = GFS2_LOG_FLUSH_MIN_BLOCKS; + if (blocks) { + /* + * The reserved blocks are either used for data or metadata. + * We can have mixed data and metadata, each with its own log + * descriptor block; see calc_reserved(). + */ + tr->tr_reserved += blocks + 1 + DIV_ROUND_UP(blocks - 1, databuf_limit(sdp)); + } INIT_LIST_HEAD(&tr->tr_databuf); INIT_LIST_HEAD(&tr->tr_buf); INIT_LIST_HEAD(&tr->tr_list); INIT_LIST_HEAD(&tr->tr_ail1_list); INIT_LIST_HEAD(&tr->tr_ail2_list); + if (gfs2_assert_warn(sdp, tr->tr_reserved <= sdp->sd_jdesc->jd_blocks)) + return -EINVAL; + sb_start_intwrite(sdp->sd_vfs); - error = gfs2_log_reserve(sdp, tr->tr_reserved); - if (error) - goto fail; + /* + * Try the reservations under sd_log_flush_lock to prevent log flushes + * from creating inconsistencies between the number of allocated and + * reserved revokes. If that fails, do a full-block allocation outside + * of the lock to avoid stalling log flushes. Then, allot the + * appropriate number of blocks to revokes, use as many revokes locally + * as needed, and "release" the surplus into the revokes pool. + */ + + down_read(&sdp->sd_log_flush_lock); + if (gfs2_log_try_reserve(sdp, tr, &extra_revokes)) + goto reserved; + up_read(&sdp->sd_log_flush_lock); + gfs2_log_reserve(sdp, tr, &extra_revokes); + down_read(&sdp->sd_log_flush_lock); + +reserved: + gfs2_log_release_revokes(sdp, extra_revokes); + if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { + gfs2_log_release_revokes(sdp, tr->tr_revokes); + up_read(&sdp->sd_log_flush_lock); + gfs2_log_release(sdp, tr->tr_reserved); + sb_end_intwrite(sdp->sd_vfs); + return -EROFS; + } current->journal_info = tr; return 0; +} -fail: - sb_end_intwrite(sdp->sd_vfs); - kmem_cache_free(gfs2_trans_cachep, tr); +int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes) +{ + struct gfs2_trans *tr; + int error; + tr = kmem_cache_zalloc(gfs2_trans_cachep, GFP_NOFS); + if (!tr) + return -ENOMEM; + error = __gfs2_trans_begin(tr, sdp, blocks, revokes, _RET_IP_); + if (error) + kmem_cache_free(gfs2_trans_cachep, tr); return error; } @@ -92,37 +125,39 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) { struct gfs2_trans *tr = current->journal_info; s64 nbuf; - int alloced = test_bit(TR_ALLOCED, &tr->tr_flags); current->journal_info = NULL; if (!test_bit(TR_TOUCHED, &tr->tr_flags)) { + gfs2_log_release_revokes(sdp, tr->tr_revokes); + up_read(&sdp->sd_log_flush_lock); gfs2_log_release(sdp, tr->tr_reserved); - if (alloced) { + if (!test_bit(TR_ONSTACK, &tr->tr_flags)) gfs2_trans_free(sdp, tr); - sb_end_intwrite(sdp->sd_vfs); - } + sb_end_intwrite(sdp->sd_vfs); return; } + gfs2_log_release_revokes(sdp, tr->tr_revokes - tr->tr_num_revoke); + nbuf = tr->tr_num_buf_new + tr->tr_num_databuf_new; nbuf -= tr->tr_num_buf_rm; nbuf -= tr->tr_num_databuf_rm; - if (gfs2_assert_withdraw(sdp, (nbuf <= tr->tr_blocks) && - (tr->tr_num_revoke <= tr->tr_revokes))) + if (gfs2_assert_withdraw(sdp, nbuf <= tr->tr_blocks) || + gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) gfs2_print_trans(sdp, tr); gfs2_log_commit(sdp, tr); - if (alloced && !test_bit(TR_ATTACHED, &tr->tr_flags)) + if (!test_bit(TR_ONSTACK, &tr->tr_flags) && + !test_bit(TR_ATTACHED, &tr->tr_flags)) gfs2_trans_free(sdp, tr); up_read(&sdp->sd_log_flush_lock); if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS) gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_TRANS_END); - if (alloced) - sb_end_intwrite(sdp->sd_vfs); + sb_end_intwrite(sdp->sd_vfs); } static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, @@ -262,7 +297,6 @@ void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) { struct gfs2_bufdata *bd, *tmp; - struct gfs2_trans *tr = current->journal_info; unsigned int n = len; gfs2_log_lock(sdp); @@ -274,7 +308,7 @@ void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) if (bd->bd_gl) gfs2_glock_remove_revoke(bd->bd_gl); kmem_cache_free(gfs2_bufdata_cachep, bd); - tr->tr_num_revoke_rm++; + gfs2_log_release_revokes(sdp, 1); if (--n == 0) break; } diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index 83199ce5a5c5..c76ad9a4c75a 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -27,13 +27,16 @@ struct gfs2_glock; * block, or all of the blocks in the rg, whichever is smaller */ static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned requested) { - struct gfs2_rgrpd *rgd = ip->i_res.rs_rbm.rgd; + struct gfs2_rgrpd *rgd = ip->i_res.rs_rgd; if (requested < rgd->rd_length) return requested + 1; return rgd->rd_length; } +extern int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp, + unsigned int blocks, unsigned int revokes, + unsigned long ip); extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, unsigned int revokes); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index a374397f4273..8d3c670c990f 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -91,12 +91,39 @@ out_unlock: return error; } +/** + * gfs2_freeze_lock - hold the freeze glock + * @sdp: the superblock + * @freeze_gh: pointer to the requested holder + * @caller_flags: any additional flags needed by the caller + */ +int gfs2_freeze_lock(struct gfs2_sbd *sdp, struct gfs2_holder *freeze_gh, + int caller_flags) +{ + int flags = LM_FLAG_NOEXP | GL_EXACT | caller_flags; + int error; + + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, flags, + freeze_gh); + if (error && error != GLR_TRYFAILED) + fs_err(sdp, "can't lock the freeze lock: %d\n", error); + return error; +} + +void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh) +{ + if (gfs2_holder_initialized(freeze_gh)) + gfs2_glock_dq_uninit(freeze_gh); +} + static void signal_our_withdraw(struct gfs2_sbd *sdp) { - struct gfs2_glock *gl = sdp->sd_live_gh.gh_gl; + struct gfs2_glock *live_gl = sdp->sd_live_gh.gh_gl; struct inode *inode = sdp->sd_jdesc->jd_inode; struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_glock *i_gl = ip->i_gl; u64 no_formal_ino = ip->i_no_formal_ino; + int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); int ret = 0; int tries; @@ -117,8 +144,21 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) * therefore we need to clear SDF_JOURNAL_LIVE manually. */ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - if (!sb_rdonly(sdp->sd_vfs)) - ret = gfs2_make_fs_ro(sdp); + if (!sb_rdonly(sdp->sd_vfs)) { + struct gfs2_holder freeze_gh; + + gfs2_holder_mark_uninitialized(&freeze_gh); + if (sdp->sd_freeze_gl && + !gfs2_glock_is_locked_by_me(sdp->sd_freeze_gl)) { + ret = gfs2_freeze_lock(sdp, &freeze_gh, + log_write_allowed ? 0 : LM_FLAG_TRY); + if (ret == GLR_TRYFAILED) + ret = 0; + } + if (!ret) + ret = gfs2_make_fs_ro(sdp); + gfs2_freeze_unlock(&freeze_gh); + } if (sdp->sd_lockstruct.ls_ops->lm_lock == NULL) { /* lock_nolock */ if (!ret) @@ -141,7 +181,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) atomic_set(&sdp->sd_freeze_state, SFS_FROZEN); thaw_super(sdp->sd_vfs); } else { - wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE); + wait_on_bit(&i_gl->gl_flags, GLF_DEMOTE, + TASK_UNINTERRUPTIBLE); } /* @@ -161,15 +202,15 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) * on other nodes to be successful, otherwise we remain the owner of * the glock as far as dlm is concerned. */ - if (gl->gl_ops->go_free) { - set_bit(GLF_FREEING, &gl->gl_flags); - wait_on_bit(&gl->gl_flags, GLF_FREEING, TASK_UNINTERRUPTIBLE); + if (i_gl->gl_ops->go_free) { + set_bit(GLF_FREEING, &i_gl->gl_flags); + wait_on_bit(&i_gl->gl_flags, GLF_FREEING, TASK_UNINTERRUPTIBLE); } /* * Dequeue the "live" glock, but keep a reference so it's never freed. */ - gfs2_glock_hold(gl); + gfs2_glock_hold(live_gl); gfs2_glock_dq_wait(&sdp->sd_live_gh); /* * We enqueue the "live" glock in EX so that all other nodes @@ -208,7 +249,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) gfs2_glock_nq(&sdp->sd_live_gh); } - gfs2_glock_queue_put(gl); /* drop the extra reference we acquired */ + gfs2_glock_queue_put(live_gl); /* drop extra reference we acquired */ clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags); /* diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index a4443dd8a94b..69e1a0ae5a4d 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -149,6 +149,9 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, extern int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, bool verbose); +extern int gfs2_freeze_lock(struct gfs2_sbd *sdp, + struct gfs2_holder *freeze_gh, int caller_flags); +extern void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh); #define gfs2_io_error(sdp) \ gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 9d7667bc4292..124b3d5a7266 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -70,6 +70,20 @@ static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize) return 0; } +static bool gfs2_eatype_valid(struct gfs2_sbd *sdp, u8 type) +{ + switch(sdp->sd_sb.sb_fs_format) { + case GFS2_FS_FORMAT_MAX: + return true; + + case GFS2_FS_FORMAT_MIN: + return type <= GFS2_EATYPE_SECURITY; + + default: + return false; + } +} + typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, void *private); @@ -77,6 +91,7 @@ typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh, static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh, ea_call_t ea_call, void *data) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_ea_header *ea, *prev = NULL; int error = 0; @@ -89,9 +104,8 @@ static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh, if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <= bh->b_data + bh->b_size)) goto fail; - if (!GFS2_EATYPE_VALID(ea->ea_type)) + if (!gfs2_eatype_valid(sdp, ea->ea_type)) goto fail; - error = ea_call(ip, bh, ea, prev, data); if (error) return error; @@ -259,7 +273,8 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, return -EIO; } - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &rg_gh); if (error) return error; @@ -344,6 +359,7 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, void *private) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct ea_list *ei = private; struct gfs2_ea_request *er = ei->ei_er; unsigned int ea_size; @@ -353,6 +369,8 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, if (ea->ea_type == GFS2_EATYPE_UNUSED) return 0; + BUG_ON(ea->ea_type > GFS2_EATYPE_SECURITY && + sdp->sd_sb.sb_fs_format == GFS2_FS_FORMAT_MIN); switch (ea->ea_type) { case GFS2_EATYPE_USR: prefix = "user."; @@ -366,8 +384,12 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, prefix = "security."; l = 9; break; + case GFS2_EATYPE_TRUSTED: + prefix = "trusted."; + l = 8; + break; default: - BUG(); + return 0; } ea_size = l + ea->ea_name_len + 1; @@ -1214,6 +1236,7 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, } static int gfs2_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -1385,7 +1408,8 @@ static int ea_dealloc_block(struct gfs2_inode *ip) return -EIO; } - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &gh); if (error) return error; @@ -1463,7 +1487,25 @@ static const struct xattr_handler gfs2_xattr_security_handler = { .set = gfs2_xattr_set, }; -const struct xattr_handler *gfs2_xattr_handlers[] = { +static bool +gfs2_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +static const struct xattr_handler gfs2_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = GFS2_EATYPE_TRUSTED, + .list = gfs2_xattr_trusted_list, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, +}; + +const struct xattr_handler *gfs2_xattr_handlers_max[] = { + /* GFS2_FS_FORMAT_MAX */ + &gfs2_xattr_trusted_handler, + + /* GFS2_FS_FORMAT_MIN */ &gfs2_xattr_user_handler, &gfs2_xattr_security_handler, &posix_acl_access_xattr_handler, @@ -1471,3 +1513,4 @@ const struct xattr_handler *gfs2_xattr_handlers[] = { NULL, }; +const struct xattr_handler **gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1; diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index 74fa62643136..2bd54efaf416 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -121,6 +121,7 @@ static int hfs_xattr_get(const struct xattr_handler *handler, } static int hfs_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 3bf2ae0e467c..527f6e46cbe8 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -189,8 +189,8 @@ static int hfs_dir_release(struct inode *inode, struct file *file) * a directory and return a corresponding inode, given the inode for * the directory and the name (and its length) of the new file. */ -static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int hfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; int res; @@ -219,7 +219,8 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, * in a directory, given the inode for the parent directory and the * name (and its length) of the new directory. */ -static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int hfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; int res; @@ -279,9 +280,9 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry) * new file/directory. * XXX: how do you handle must_be dir? */ -static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int hfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { int res; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index f71c384064c8..b8eb0322a3e5 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -204,7 +204,8 @@ extern const struct address_space_operations hfs_btree_aops; extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); extern int hfs_write_inode(struct inode *, struct writeback_control *); -extern int hfs_inode_setattr(struct dentry *, struct iattr *); +extern int hfs_inode_setattr(struct user_namespace *, struct dentry *, + struct iattr *); extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 log_size, __be32 phys_size, u32 clump_size); extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index f35a37c65e5f..3fc5cb346586 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -602,13 +602,15 @@ static int hfs_file_release(struct inode *inode, struct file *file) * correspond to the same HFS file. */ -int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) +int hfs_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); int error; - error = setattr_prepare(dentry, attr); /* basic permission checks */ + error = setattr_prepare(&init_user_ns, dentry, + attr); /* basic permission checks */ if (error) return error; @@ -647,7 +649,7 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) current_time(inode); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 29a9dcfbe81f..03e6c046faf4 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -434,8 +434,8 @@ out: return res; } -static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int hfsplus_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode; @@ -476,8 +476,8 @@ out: return res; } -static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int hfsplus_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode; @@ -517,18 +517,20 @@ out: return res; } -static int hfsplus_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int hfsplus_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - return hfsplus_mknod(dir, dentry, mode, 0); + return hfsplus_mknod(&init_user_ns, dir, dentry, mode, 0); } -static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int hfsplus_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0); + return hfsplus_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0); } -static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, +static int hfsplus_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index a92de5199ec3..12b20479ed2b 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -488,8 +488,9 @@ void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork); int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd); int hfsplus_cat_write_inode(struct inode *inode); -int hfsplus_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags); +int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index ca464328b79c..078c5c8a5156 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -241,12 +241,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) return 0; } -static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) +static int hfsplus_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -264,14 +265,15 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) inode->i_mtime = inode->i_ctime = current_time(inode); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } -int hfsplus_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); @@ -286,7 +288,7 @@ int hfsplus_getattr(const struct path *path, struct kstat *stat, stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP; - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); return 0; } @@ -376,7 +378,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, return NULL; inode->i_ino = sbi->next_cnid++; - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index ce15b9496b77..3edb1926d127 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -91,7 +91,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) if (err) goto out; - if (!inode_owner_or_capable(inode)) { + if (!inode_owner_or_capable(&init_user_ns, inode)) { err = -EACCES; goto out_drop_write; } diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index bb0b27d88e50..4d169c5a2673 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -858,6 +858,7 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler, } static int hfsplus_osx_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index cfbe6a3bfb1e..c1c7a16cbf21 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -23,6 +23,7 @@ static int hfsplus_security_getxattr(const struct xattr_handler *handler, } static int hfsplus_security_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index fbad91e1dada..e150372ec564 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -22,6 +22,7 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler, } static int hfsplus_trusted_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index 74d19faf255e..a6b60b153916 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -22,6 +22,7 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler, } static int hfsplus_user_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 4a5beca6eaa8..29e407762626 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -557,8 +557,8 @@ static int read_name(struct inode *ino, char *name) return 0; } -static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int hostfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; char *name; @@ -656,8 +656,8 @@ static int hostfs_unlink(struct inode *ino, struct dentry *dentry) return err; } -static int hostfs_symlink(struct inode *ino, struct dentry *dentry, - const char *to) +static int hostfs_symlink(struct user_namespace *mnt_userns, struct inode *ino, + struct dentry *dentry, const char *to) { char *file; int err; @@ -669,7 +669,8 @@ static int hostfs_symlink(struct inode *ino, struct dentry *dentry, return err; } -static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) +static int hostfs_mkdir(struct user_namespace *mnt_userns, struct inode *ino, + struct dentry *dentry, umode_t mode) { char *file; int err; @@ -693,7 +694,8 @@ static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) return err; } -static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; char *name; @@ -731,7 +733,8 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, return err; } -static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry, +static int hostfs_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -759,7 +762,8 @@ static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry, return err; } -static int hostfs_permission(struct inode *ino, int desired) +static int hostfs_permission(struct user_namespace *mnt_userns, + struct inode *ino, int desired) { char *name; int r = 0, w = 0, x = 0, err; @@ -781,11 +785,12 @@ static int hostfs_permission(struct inode *ino, int desired) err = access_file(name, r, w, x); __putname(name); if (!err) - err = generic_permission(ino, desired); + err = generic_permission(&init_user_ns, ino, desired); return err; } -static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) +static int hostfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hostfs_iattr attrs; @@ -794,7 +799,7 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) int fd = HOSTFS_I(inode)->fd; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) return err; @@ -851,7 +856,7 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_size != i_size_read(inode)) truncate_setsize(inode, attr->ia_size); - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 1cca83218fb5..167ec6884642 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -280,7 +280,7 @@ void hpfs_init_inode(struct inode *); void hpfs_read_inode(struct inode *); void hpfs_write_inode(struct inode *); void hpfs_write_inode_nolock(struct inode *); -int hpfs_setattr(struct dentry *, struct iattr *); +int hpfs_setattr(struct user_namespace *, struct dentry *, struct iattr *); void hpfs_write_if_changed(struct inode *); void hpfs_evict_inode(struct inode *); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index eb8b4baf0f2e..82208cc28ebd 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -257,7 +257,8 @@ void hpfs_write_inode_nolock(struct inode *i) brelse(bh); } -int hpfs_setattr(struct dentry *dentry, struct iattr *attr) +int hpfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); int error = -EINVAL; @@ -274,7 +275,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) goto out_unlock; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) goto out_unlock; @@ -288,7 +289,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) hpfs_truncate(inode); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); hpfs_write_inode(inode); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 1aee39160ac5..d73f8a67168e 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -20,7 +20,8 @@ static void hpfs_update_directory_times(struct inode *dir) hpfs_write_inode_nolock(dir); } -static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int hpfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -128,7 +129,8 @@ bail: return err; } -static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) +static int hpfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -215,7 +217,8 @@ bail: return err; } -static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +static int hpfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -289,7 +292,8 @@ bail: return err; } -static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink) +static int hpfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symlink) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -506,10 +510,10 @@ fail: const struct address_space_operations hpfs_symlink_aops = { .readpage = hpfs_symlink_readpage }; - -static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) + +static int hpfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { const unsigned char *old_name = old_dentry->d_name.name; unsigned old_len = old_dentry->d_name.len; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 21c20fd5f9ee..701c82c36138 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -171,7 +171,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); ret = -ENOMEM; - if (hugetlb_reserve_pages(inode, + if (!hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, vma->vm_flags)) @@ -310,7 +310,7 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, /* * Support for read() - Find the page attached to f_mapping and copy out the - * data. Its *very* similar to do_generic_mapping_read(), we can't use that + * data. Its *very* similar to generic_file_buffered_read(), we can't use that * since it has PAGE_SIZE assumptions. */ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -442,15 +442,15 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) * * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. - * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv + * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve * maps and global counts. Page faults can not race with truncation * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents * page faults in the truncated range by checking i_size. i_size is * modified while holding i_mmap_rwsem. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. - * Only when releasing a page is the associated region/reserv map - * deleted. The region/reserv map for ranges without associated + * Only when releasing a page is the associated region/reserve map + * deleted. The region/reserve map for ranges without associated * pages are not modified. Page faults can race with hole punch. * This is indicated if we find a mapped page. * Note: If the passed end of range value is beyond the end of file, but @@ -567,7 +567,7 @@ static void hugetlbfs_evict_inode(struct inode *inode) clear_inode(inode); } -static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) +static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; struct address_space *mapping = inode->i_mapping; @@ -582,7 +582,6 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); - return 0; } static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) @@ -604,7 +603,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); - /* protected by i_mutex */ + /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { inode_unlock(inode); return -EPERM; @@ -680,7 +679,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, */ struct page *page; unsigned long addr; - int avoid_reserve = 0; cond_resched(); @@ -716,8 +714,15 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, continue; } - /* Allocate page and add to page cache */ - page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve); + /* + * Allocate page without setting the avoid_reserve argument. + * There certainly are no reserves associated with the + * pseudo_vma. However, there could be shared mappings with + * reserves for the file at the inode level. If we fallocate + * pages in these areas, we need to consume the reserves + * to keep reservation accounting consistent. + */ + page = alloc_huge_page(&pseudo_vma, addr, 0); hugetlb_drop_vma_policy(&pseudo_vma); if (IS_ERR(page)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -735,7 +740,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_unlock(&hugetlb_fault_mutex_table[hash]); - set_page_huge_active(page); + SetHPageMigratable(page); /* * unlock_page because locked by add_to_page_cache() * put_page() due to reference from alloc_huge_page() @@ -752,7 +757,8 @@ out: return error; } -static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) +static int hugetlbfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hstate *h = hstate_inode(inode); @@ -760,9 +766,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) unsigned int ia_valid = attr->ia_valid; struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); - BUG_ON(!inode); - - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -772,16 +776,14 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) if (newsize & ~huge_page_mask(h)) return -EINVAL; - /* protected by i_mutex */ + /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; - error = hugetlb_vmtruncate(inode, newsize); - if (error) - return error; + hugetlb_vmtruncate(inode, newsize); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } @@ -837,7 +839,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode->i_ino = get_next_ino(); - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; @@ -899,33 +901,39 @@ static int do_hugetlbfs_mknod(struct inode *dir, return error; } -static int hugetlbfs_mknod(struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t dev) +static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { return do_hugetlbfs_mknod(dir, dentry, mode, dev, false); } -static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { - int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); + int retval = hugetlbfs_mknod(&init_user_ns, dir, dentry, + mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return retval; } -static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) +static int hugetlbfs_create(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) { - return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); + return hugetlbfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); } -static int hugetlbfs_tmpfile(struct inode *dir, - struct dentry *dentry, umode_t mode) +static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + umode_t mode) { return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true); } -static int hugetlbfs_symlink(struct inode *dir, - struct dentry *dentry, const char *symname) +static int hugetlbfs_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + const char *symname) { struct inode *inode; int error = -ENOSPC; @@ -945,17 +953,6 @@ static int hugetlbfs_symlink(struct inode *dir, return error; } -/* - * mark the head page dirty - */ -static int hugetlbfs_set_page_dirty(struct page *page) -{ - struct page *head = compound_head(page); - - SetPageDirty(head); - return 0; -} - static int hugetlbfs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) @@ -966,15 +963,9 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - /* - * page_private is subpool pointer in hugetlb pages. Transfer to - * new page. PagePrivate is not associated with page_private for - * hugetlb pages and can not be set here as only page_huge_active - * pages can be migrated. - */ - if (page_private(page)) { - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); + if (hugetlb_page_subpool(page)) { + hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page)); + hugetlb_set_page_subpool(page, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -1149,7 +1140,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode) static const struct address_space_operations hugetlbfs_aops = { .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, - .set_page_dirty = hugetlbfs_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, .migratepage = hugetlbfs_migrate_page, .error_remove_page = hugetlbfs_error_remove_page, }; @@ -1349,7 +1340,7 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) /* * Allocate and initialize subpool if maximum or minimum size is - * specified. Any needed reservations (for minimim size) are taken + * specified. Any needed reservations (for minimum size) are taken * taken when the subpool is created. */ if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { @@ -1492,7 +1483,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, inode->i_size = size; clear_nlink(inode); - if (hugetlb_reserve_pages(inode, 0, + if (!hugetlb_reserve_pages(inode, 0, size >> huge_page_shift(hstate_inode(inode)), NULL, acctflag)) file = ERR_PTR(-ENOMEM); @@ -1526,8 +1517,8 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) put_fs_context(fc); } if (IS_ERR(mnt)) - pr_err("Cannot mount internal hugetlbfs for page size %uK", - 1U << (h->order + PAGE_SHIFT - 10)); + pr_err("Cannot mount internal hugetlbfs for page size %luK", + huge_page_size(h) >> 10); return mnt; } @@ -1555,7 +1546,7 @@ static int __init init_hugetlbfs_fs(void) goto out_free; /* default hstate mount is required */ - mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]); + mnt = mount_one_hugetlbfs(&default_hstate); if (IS_ERR(mnt)) { error = PTR_ERR(mnt); goto out_unreg; diff --git a/fs/init.c b/fs/init.c index e9c320a48cf1..5c36adaa9b44 100644 --- a/fs/init.c +++ b/fs/init.c @@ -49,7 +49,7 @@ int __init init_chdir(const char *filename) error = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); if (error) return error; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &path); path_put(&path); @@ -64,7 +64,7 @@ int __init init_chroot(const char *filename) error = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); if (error) return error; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; error = -EPERM; @@ -118,7 +118,7 @@ int __init init_eaccess(const char *filename) error = kern_path(filename, LOOKUP_FOLLOW, &path); if (error) return error; - error = inode_permission(d_inode(path.dentry), MAY_ACCESS); + error = path_permission(&path, MAY_ACCESS); path_put(&path); return error; } @@ -157,8 +157,8 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) mode &= ~current_umask(); error = security_path_mknod(&path, dentry, mode, dev); if (!error) - error = vfs_mknod(path.dentry->d_inode, dentry, mode, - new_decode_dev(dev)); + error = vfs_mknod(mnt_user_ns(path.mnt), path.dentry->d_inode, + dentry, mode, new_decode_dev(dev)); done_path_create(&path, dentry); return error; } @@ -167,6 +167,7 @@ int __init init_link(const char *oldname, const char *newname) { struct dentry *new_dentry; struct path old_path, new_path; + struct user_namespace *mnt_userns; int error; error = kern_path(oldname, 0, &old_path); @@ -181,14 +182,15 @@ int __init init_link(const char *oldname, const char *newname) error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - error = may_linkat(&old_path); + mnt_userns = mnt_user_ns(new_path.mnt); + error = may_linkat(mnt_userns, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, - NULL); + error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode, + new_dentry, NULL); out_dput: done_path_create(&new_path, new_dentry); out: @@ -207,7 +209,8 @@ int __init init_symlink(const char *oldname, const char *newname) return PTR_ERR(dentry); error = security_path_symlink(&path, dentry, oldname); if (!error) - error = vfs_symlink(path.dentry->d_inode, dentry, oldname); + error = vfs_symlink(mnt_user_ns(path.mnt), path.dentry->d_inode, + dentry, oldname); done_path_create(&path, dentry); return error; } @@ -230,7 +233,8 @@ int __init init_mkdir(const char *pathname, umode_t mode) mode &= ~current_umask(); error = security_path_mkdir(&path, dentry, mode); if (!error) - error = vfs_mkdir(path.dentry->d_inode, dentry, mode); + error = vfs_mkdir(mnt_user_ns(path.mnt), path.dentry->d_inode, + dentry, mode); done_path_create(&path, dentry); return error; } diff --git a/fs/inode.c b/fs/inode.c index 874242169547..a047ab306f9a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -142,6 +142,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &no_open_fops; + inode->i_ino = 0; inode->__i_nlink = 1; inode->i_opflags = 0; if (sb->s_xattr) @@ -1798,7 +1799,7 @@ bool atime_needs_update(const struct path *path, struct inode *inode) /* Atime updates will likely cause i_uid and i_gid to be written * back improprely if their true value is unknown to the vfs. */ - if (HAS_UNMAPPED_ID(inode)) + if (HAS_UNMAPPED_ID(mnt_user_ns(mnt), inode)) return false; if (IS_NOATIME(inode)) @@ -1905,7 +1906,8 @@ int dentry_needs_remove_privs(struct dentry *dentry) return mask; } -static int __remove_privs(struct dentry *dentry, int kill) +static int __remove_privs(struct user_namespace *mnt_userns, + struct dentry *dentry, int kill) { struct iattr newattrs; @@ -1914,7 +1916,7 @@ static int __remove_privs(struct dentry *dentry, int kill) * Note we call this on write, so notify_change will not * encounter any conflicting delegations: */ - return notify_change(dentry, &newattrs, NULL); + return notify_change(mnt_userns, dentry, &newattrs, NULL); } /* @@ -1941,7 +1943,7 @@ int file_remove_privs(struct file *file) if (kill < 0) return kill; if (kill) - error = __remove_privs(dentry, kill); + error = __remove_privs(file_mnt_user_ns(file), dentry, kill); if (!error) inode_has_no_xattr(inode); @@ -2132,14 +2134,21 @@ EXPORT_SYMBOL(init_special_inode); /** * inode_init_owner - Init uid,gid,mode for new inode according to posix standards + * @mnt_userns: User namespace of the mount the inode was created from * @inode: New inode * @dir: Directory inode * @mode: mode of the new inode + * + * If the inode has been created through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions + * and initializing i_uid and i_gid. On non-idmapped mounts or if permission + * checking is to be performed on the raw inode simply passs init_user_ns. */ -void inode_init_owner(struct inode *inode, const struct inode *dir, - umode_t mode) +void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, + const struct inode *dir, umode_t mode) { - inode->i_uid = current_fsuid(); + inode->i_uid = fsuid_into_mnt(mnt_userns); if (dir && dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; @@ -2147,31 +2156,41 @@ void inode_init_owner(struct inode *inode, const struct inode *dir, if (S_ISDIR(mode)) mode |= S_ISGID; else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) && - !in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(dir, CAP_FSETID)) + !in_group_p(i_gid_into_mnt(mnt_userns, dir)) && + !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID)) mode &= ~S_ISGID; } else - inode->i_gid = current_fsgid(); + inode->i_gid = fsgid_into_mnt(mnt_userns); inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner); /** * inode_owner_or_capable - check current task permissions to inode + * @mnt_userns: user namespace of the mount the inode was found from * @inode: inode being checked * * Return true if current either has CAP_FOWNER in a namespace with the * inode owner uid mapped, or owns the file. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. */ -bool inode_owner_or_capable(const struct inode *inode) +bool inode_owner_or_capable(struct user_namespace *mnt_userns, + const struct inode *inode) { + kuid_t i_uid; struct user_namespace *ns; - if (uid_eq(current_fsuid(), inode->i_uid)) + i_uid = i_uid_into_mnt(mnt_userns, inode); + if (uid_eq(current_fsuid(), i_uid)) return true; ns = current_user_ns(); - if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER)) + if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER)) return true; return false; } diff --git a/fs/internal.h b/fs/internal.h index 49bfb3750b22..6aeae7ef3380 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -74,7 +74,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); long do_rmdir(int dfd, struct filename *name); long do_unlinkat(int dfd, struct filename *name); -int may_linkat(struct path *link); +int may_linkat(struct user_namespace *mnt_userns, struct path *link); int do_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); diff --git a/fs/io-wq.c b/fs/io-wq.c index c36bbcd823ce..28868eb4cd09 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -13,13 +13,10 @@ #include <linux/sched/mm.h> #include <linux/percpu.h> #include <linux/slab.h> -#include <linux/kthread.h> #include <linux/rculist_nulls.h> -#include <linux/fs_struct.h> -#include <linux/task_work.h> -#include <linux/blk-cgroup.h> -#include <linux/audit.h> #include <linux/cpu.h> +#include <linux/tracehook.h> +#include <linux/freezer.h> #include "../kernel/sched/sched.h" #include "io-wq.h" @@ -36,7 +33,6 @@ enum { enum { IO_WQ_BIT_EXIT = 0, /* wq exiting */ - IO_WQ_BIT_ERROR = 1, /* error on setup */ }; enum { @@ -57,14 +53,9 @@ struct io_worker { struct io_wq_work *cur_work; spinlock_t lock; + struct completion ref_done; + struct rcu_head rcu; - struct mm_struct *mm; -#ifdef CONFIG_BLK_CGROUP - struct cgroup_subsys_state *blkcg_css; -#endif - const struct cred *cur_creds; - const struct cred *saved_creds; - struct nsproxy *restore_nsproxy; }; #if BITS_PER_LONG == 64 @@ -93,7 +84,6 @@ struct io_wqe { struct { raw_spinlock_t lock; struct io_wq_work_list work_list; - unsigned long hash_map; unsigned flags; } ____cacheline_aligned_in_smp; @@ -103,6 +93,8 @@ struct io_wqe { struct hlist_nulls_head free_list; struct list_head all_list; + struct wait_queue_entry wait; + struct io_wq *wq; struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; }; @@ -119,16 +111,33 @@ struct io_wq { struct task_struct *manager; struct user_struct *user; + + struct io_wq_hash *hash; + refcount_t refs; - struct completion done; + struct completion exited; + + atomic_t worker_refs; + struct completion worker_done; struct hlist_node cpuhp_node; - refcount_t use_refs; + pid_t task_pid; }; static enum cpuhp_state io_wq_online; +struct io_cb_cancel_data { + work_cancel_fn *fn; + void *data; + int nr_running; + int nr_pending; + bool cancel_all; +}; + +static void io_wqe_cancel_pending_work(struct io_wqe *wqe, + struct io_cb_cancel_data *match); + static bool io_worker_get(struct io_worker *worker) { return refcount_inc_not_zero(&worker->ref); @@ -137,62 +146,7 @@ static bool io_worker_get(struct io_worker *worker) static void io_worker_release(struct io_worker *worker) { if (refcount_dec_and_test(&worker->ref)) - wake_up_process(worker->task); -} - -/* - * Note: drops the wqe->lock if returning true! The caller must re-acquire - * the lock in that case. Some callers need to restart handling if this - * happens, so we can't just re-acquire the lock on behalf of the caller. - */ -static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) -{ - bool dropped_lock = false; - - if (worker->saved_creds) { - revert_creds(worker->saved_creds); - worker->cur_creds = worker->saved_creds = NULL; - } - - if (current->files) { - __acquire(&wqe->lock); - raw_spin_unlock_irq(&wqe->lock); - dropped_lock = true; - - task_lock(current); - current->files = NULL; - current->nsproxy = worker->restore_nsproxy; - task_unlock(current); - } - - if (current->fs) - current->fs = NULL; - - /* - * If we have an active mm, we need to drop the wq lock before unusing - * it. If we do, return true and let the caller retry the idle loop. - */ - if (worker->mm) { - if (!dropped_lock) { - __acquire(&wqe->lock); - raw_spin_unlock_irq(&wqe->lock); - dropped_lock = true; - } - __set_current_state(TASK_RUNNING); - kthread_unuse_mm(worker->mm); - mmput(worker->mm); - worker->mm = NULL; - } - -#ifdef CONFIG_BLK_CGROUP - if (worker->blkcg_css) { - kthread_associate_blkcg(NULL); - worker->blkcg_css = NULL; - } -#endif - if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - return dropped_lock; + complete(&worker->ref_done); } static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, @@ -204,9 +158,10 @@ static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, return &wqe->acct[IO_WQ_ACCT_BOUND]; } -static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe, - struct io_worker *worker) +static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker) { + struct io_wqe *wqe = worker->wqe; + if (worker->flags & IO_WORKER_F_BOUND) return &wqe->acct[IO_WQ_ACCT_BOUND]; @@ -216,39 +171,33 @@ static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe, static void io_worker_exit(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; - struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + struct io_wqe_acct *acct = io_wqe_get_acct(worker); + unsigned flags; - /* - * If we're not at zero, someone else is holding a brief reference - * to the worker. Wait for that to go away. - */ - set_current_state(TASK_INTERRUPTIBLE); - if (!refcount_dec_and_test(&worker->ref)) - schedule(); - __set_current_state(TASK_RUNNING); + if (refcount_dec_and_test(&worker->ref)) + complete(&worker->ref_done); + wait_for_completion(&worker->ref_done); preempt_disable(); current->flags &= ~PF_IO_WORKER; - if (worker->flags & IO_WORKER_F_RUNNING) + flags = worker->flags; + worker->flags = 0; + if (flags & IO_WORKER_F_RUNNING) atomic_dec(&acct->nr_running); - if (!(worker->flags & IO_WORKER_F_BOUND)) - atomic_dec(&wqe->wq->user->processes); worker->flags = 0; preempt_enable(); raw_spin_lock_irq(&wqe->lock); - hlist_nulls_del_rcu(&worker->nulls_node); + if (flags & IO_WORKER_F_FREE) + hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - if (__io_worker_unuse(wqe, worker)) { - __release(&wqe->lock); - raw_spin_lock_irq(&wqe->lock); - } acct->nr_workers--; raw_spin_unlock_irq(&wqe->lock); kfree_rcu(worker, rcu); - if (refcount_dec_and_test(&wqe->wq->refs)) - complete(&wqe->wq->done); + if (atomic_dec_and_test(&wqe->wq->worker_refs)) + complete(&wqe->wq->worker_done); + do_exit(0); } static inline bool io_wqe_run_queue(struct io_wqe *wqe) @@ -306,35 +255,23 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) wake_up_process(wqe->wq->manager); } -static void io_wqe_inc_running(struct io_wqe *wqe, struct io_worker *worker) +static void io_wqe_inc_running(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + struct io_wqe_acct *acct = io_wqe_get_acct(worker); atomic_inc(&acct->nr_running); } -static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker) +static void io_wqe_dec_running(struct io_worker *worker) __must_hold(wqe->lock) { - struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wqe *wqe = worker->wqe; if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) io_wqe_wake_worker(wqe, acct); } -static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) -{ - allow_kernel_signal(SIGINT); - - current->flags |= PF_IO_WORKER; - current->fs = NULL; - current->files = NULL; - - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); - worker->restore_nsproxy = current->nsproxy; - io_wqe_inc_running(wqe, worker); -} - /* * Worker will start processing some work. Move it to the busy list, if * it's currently on the freelist @@ -357,19 +294,17 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0; work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0; if (worker_bound != work_bound) { - io_wqe_dec_running(wqe, worker); + io_wqe_dec_running(worker); if (work_bound) { worker->flags |= IO_WORKER_F_BOUND; wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--; wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++; - atomic_dec(&wqe->wq->user->processes); } else { worker->flags &= ~IO_WORKER_F_BOUND; wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++; wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--; - atomic_inc(&wqe->wq->user->processes); } - io_wqe_inc_running(wqe, worker); + io_wqe_inc_running(worker); } } @@ -380,15 +315,13 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, * retry the loop in that case (we changed task state), we don't regrab * the lock if we return success. */ -static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) +static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) __must_hold(wqe->lock) { if (!(worker->flags & IO_WORKER_F_FREE)) { worker->flags |= IO_WORKER_F_FREE; hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); } - - return __io_worker_unuse(wqe, worker); } static inline unsigned int io_get_work_hash(struct io_wq_work *work) @@ -396,14 +329,31 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work) return work->flags >> IO_WQ_HASH_SHIFT; } +static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) +{ + struct io_wq *wq = wqe->wq; + + spin_lock(&wq->hash->wait.lock); + if (list_empty(&wqe->wait.entry)) { + __add_wait_queue(&wq->hash->wait, &wqe->wait); + if (!test_bit(hash, &wq->hash->map)) { + __set_current_state(TASK_RUNNING); + list_del_init(&wqe->wait.entry); + } + } + spin_unlock(&wq->hash->wait.lock); +} + static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) __must_hold(wqe->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; - unsigned int hash; + unsigned int stall_hash = -1U; wq_list_for_each(node, prev, &wqe->work_list) { + unsigned int hash; + work = container_of(node, struct io_wq_work, list); /* not hashed, can run anytime */ @@ -412,111 +362,48 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) return work; } - /* hashed, can run if not already running */ hash = io_get_work_hash(work); - if (!(wqe->hash_map & BIT(hash))) { - wqe->hash_map |= BIT(hash); - /* all items with this hash lie in [work, tail] */ - tail = wqe->hash_tail[hash]; + /* all items with this hash lie in [work, tail] */ + tail = wqe->hash_tail[hash]; + + /* hashed, can run if not already running */ + if (!test_and_set_bit(hash, &wqe->wq->hash->map)) { wqe->hash_tail[hash] = NULL; wq_list_cut(&wqe->work_list, &tail->list, prev); return work; } + if (stall_hash == -1U) + stall_hash = hash; + /* fast forward to a next hash, for-each will fix up @prev */ + node = &tail->list; } - return NULL; -} - -static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) -{ - if (worker->mm) { - kthread_unuse_mm(worker->mm); - mmput(worker->mm); - worker->mm = NULL; - } - - if (mmget_not_zero(work->identity->mm)) { - kthread_use_mm(work->identity->mm); - worker->mm = work->identity->mm; - return; - } - - /* failed grabbing mm, ensure work gets cancelled */ - work->flags |= IO_WQ_WORK_CANCEL; -} - -static inline void io_wq_switch_blkcg(struct io_worker *worker, - struct io_wq_work *work) -{ -#ifdef CONFIG_BLK_CGROUP - if (!(work->flags & IO_WQ_WORK_BLKCG)) - return; - if (work->identity->blkcg_css != worker->blkcg_css) { - kthread_associate_blkcg(work->identity->blkcg_css); - worker->blkcg_css = work->identity->blkcg_css; + if (stall_hash != -1U) { + raw_spin_unlock(&wqe->lock); + io_wait_on_hash(wqe, stall_hash); + raw_spin_lock(&wqe->lock); } -#endif -} - -static void io_wq_switch_creds(struct io_worker *worker, - struct io_wq_work *work) -{ - const struct cred *old_creds = override_creds(work->identity->creds); - worker->cur_creds = work->identity->creds; - if (worker->saved_creds) - put_cred(old_creds); /* creds set by previous switch */ - else - worker->saved_creds = old_creds; + return NULL; } -static void io_impersonate_work(struct io_worker *worker, - struct io_wq_work *work) +static void io_flush_signals(void) { - if ((work->flags & IO_WQ_WORK_FILES) && - current->files != work->identity->files) { - task_lock(current); - current->files = work->identity->files; - current->nsproxy = work->identity->nsproxy; - task_unlock(current); - if (!work->identity->files) { - /* failed grabbing files, ensure work gets cancelled */ - work->flags |= IO_WQ_WORK_CANCEL; - } + if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) { + if (current->task_works) + task_work_run(); + clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL); } - if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs) - current->fs = work->identity->fs; - if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm) - io_wq_switch_mm(worker, work); - if ((work->flags & IO_WQ_WORK_CREDS) && - worker->cur_creds != work->identity->creds) - io_wq_switch_creds(worker, work); - if (work->flags & IO_WQ_WORK_FSIZE) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; - else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - io_wq_switch_blkcg(worker, work); -#ifdef CONFIG_AUDIT - current->loginuid = work->identity->loginuid; - current->sessionid = work->identity->sessionid; -#endif } static void io_assign_current_work(struct io_worker *worker, struct io_wq_work *work) { if (work) { - /* flush pending signals before assigning new work */ - if (signal_pending(current)) - flush_signals(current); + io_flush_signals(); cond_resched(); } -#ifdef CONFIG_AUDIT - current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET); - current->sessionid = AUDIT_SID_UNSET; -#endif - spin_lock_irq(&worker->lock); worker->cur_work = work; spin_unlock_irq(&worker->lock); @@ -550,6 +437,7 @@ get_next: if (!work) break; io_assign_current_work(worker, work); + __set_current_state(TASK_RUNNING); /* handle a whole dependent link */ do { @@ -557,7 +445,6 @@ get_next: unsigned int hash = io_get_work_hash(work); next_hashed = wq_next_work(work); - io_impersonate_work(worker, work); wq->do_work(work); io_assign_current_work(worker, NULL); @@ -572,8 +459,10 @@ get_next: io_wqe_enqueue(wqe, linked); if (hash != -1U && !next_hashed) { + clear_bit(hash, &wq->hash->map); + if (wq_has_sleeper(&wq->hash->wait)) + wake_up(&wq->hash->wait); raw_spin_lock_irq(&wqe->lock); - wqe->hash_map &= ~BIT_ULL(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; /* skip unnecessary unlock-lock wqe->lock */ if (!work) @@ -591,28 +480,29 @@ static int io_wqe_worker(void *data) struct io_worker *worker = data; struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; + char buf[TASK_COMM_LEN]; + + worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); + io_wqe_inc_running(worker); - io_worker_start(wqe, worker); + sprintf(buf, "iou-wrk-%d", wq->task_pid); + set_task_comm(current, buf); while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { set_current_state(TASK_INTERRUPTIBLE); loop: raw_spin_lock_irq(&wqe->lock); if (io_wqe_run_queue(wqe)) { - __set_current_state(TASK_RUNNING); io_worker_handle_work(worker); goto loop; } - /* drops the lock on success, retry */ - if (__io_worker_idle(wqe, worker)) { - __release(&wqe->lock); - goto loop; - } + __io_worker_idle(wqe, worker); raw_spin_unlock_irq(&wqe->lock); - if (signal_pending(current)) - flush_signals(current); + io_flush_signals(); if (schedule_timeout(WORKER_IDLE_TIMEOUT)) continue; + if (fatal_signal_pending(current)) + break; /* timed out, exit unless we're the fixed worker */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || !(worker->flags & IO_WORKER_F_FIXED)) @@ -636,15 +526,16 @@ loop: */ void io_wq_worker_running(struct task_struct *tsk) { - struct io_worker *worker = kthread_data(tsk); - struct io_wqe *wqe = worker->wqe; + struct io_worker *worker = tsk->pf_io_worker; + if (!worker) + return; if (!(worker->flags & IO_WORKER_F_UP)) return; if (worker->flags & IO_WORKER_F_RUNNING) return; worker->flags |= IO_WORKER_F_RUNNING; - io_wqe_inc_running(wqe, worker); + io_wqe_inc_running(worker); } /* @@ -654,9 +545,10 @@ void io_wq_worker_running(struct task_struct *tsk) */ void io_wq_worker_sleeping(struct task_struct *tsk) { - struct io_worker *worker = kthread_data(tsk); - struct io_wqe *wqe = worker->wqe; + struct io_worker *worker = tsk->pf_io_worker; + if (!worker) + return; if (!(worker->flags & IO_WORKER_F_UP)) return; if (!(worker->flags & IO_WORKER_F_RUNNING)) @@ -664,15 +556,18 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; - raw_spin_lock_irq(&wqe->lock); - io_wqe_dec_running(wqe, worker); - raw_spin_unlock_irq(&wqe->lock); + raw_spin_lock_irq(&worker->wqe->lock); + io_wqe_dec_running(worker); + raw_spin_unlock_irq(&worker->wqe->lock); } static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) { struct io_wqe_acct *acct = &wqe->acct[index]; struct io_worker *worker; + struct task_struct *tsk; + + __set_current_state(TASK_RUNNING); worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); if (!worker) @@ -682,14 +577,22 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) worker->nulls_node.pprev = NULL; worker->wqe = wqe; spin_lock_init(&worker->lock); + init_completion(&worker->ref_done); + + atomic_inc(&wq->worker_refs); - worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node, - "io_wqe_worker-%d/%d", index, wqe->node); - if (IS_ERR(worker->task)) { + tsk = create_io_thread(io_wqe_worker, worker, wqe->node); + if (IS_ERR(tsk)) { + if (atomic_dec_and_test(&wq->worker_refs)) + complete(&wq->worker_done); kfree(worker); return false; } - kthread_bind_mask(worker->task, cpumask_of_node(wqe->node)); + + tsk->pf_io_worker = worker; + worker->task = tsk; + set_cpus_allowed_ptr(tsk, cpumask_of_node(wqe->node)); + tsk->flags |= PF_NOFREEZE | PF_NO_SETAFFINITY; raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); @@ -701,12 +604,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) worker->flags |= IO_WORKER_F_FIXED; acct->nr_workers++; raw_spin_unlock_irq(&wqe->lock); - - if (index == IO_WQ_ACCT_UNBOUND) - atomic_inc(&wq->user->processes); - - refcount_inc(&wq->refs); - wake_up_process(worker->task); + wake_up_new_task(tsk); return true; } @@ -715,6 +613,8 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) { struct io_wqe_acct *acct = &wqe->acct[index]; + if (acct->nr_workers && test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) + return false; /* if we have available workers or no work, no need */ if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe)) return false; @@ -748,97 +648,92 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe, static bool io_wq_worker_wake(struct io_worker *worker, void *data) { + set_notify_signal(worker->task); wake_up_process(worker->task); return false; } -/* - * Manager thread. Tasked with creating new workers, if we need them. - */ -static int io_wq_manager(void *data) +static void io_wq_check_workers(struct io_wq *wq) { - struct io_wq *wq = data; int node; - /* create fixed workers */ - refcount_set(&wq->refs, 1); for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + bool fork_worker[2] = { false, false }; + if (!node_online(node)) continue; - if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) - continue; - set_bit(IO_WQ_BIT_ERROR, &wq->state); - set_bit(IO_WQ_BIT_EXIT, &wq->state); - goto out; + + raw_spin_lock_irq(&wqe->lock); + if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) + fork_worker[IO_WQ_ACCT_BOUND] = true; + if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) + fork_worker[IO_WQ_ACCT_UNBOUND] = true; + raw_spin_unlock_irq(&wqe->lock); + if (fork_worker[IO_WQ_ACCT_BOUND]) + create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); + if (fork_worker[IO_WQ_ACCT_UNBOUND]) + create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND); } +} - complete(&wq->done); +static bool io_wq_work_match_all(struct io_wq_work *work, void *data) +{ + return true; +} - while (!kthread_should_stop()) { - if (current->task_works) - task_work_run(); +static void io_wq_cancel_pending(struct io_wq *wq) +{ + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_all, + .cancel_all = true, + }; + int node; - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - bool fork_worker[2] = { false, false }; + for_each_node(node) + io_wqe_cancel_pending_work(wq->wqes[node], &match); +} + +/* + * Manager thread. Tasked with creating new workers, if we need them. + */ +static int io_wq_manager(void *data) +{ + struct io_wq *wq = data; + char buf[TASK_COMM_LEN]; + int node; - if (!node_online(node)) - continue; + sprintf(buf, "iou-mgr-%d", wq->task_pid); + set_task_comm(current, buf); - raw_spin_lock_irq(&wqe->lock); - if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) - fork_worker[IO_WQ_ACCT_BOUND] = true; - if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) - fork_worker[IO_WQ_ACCT_UNBOUND] = true; - raw_spin_unlock_irq(&wqe->lock); - if (fork_worker[IO_WQ_ACCT_BOUND]) - create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); - if (fork_worker[IO_WQ_ACCT_UNBOUND]) - create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND); - } + do { set_current_state(TASK_INTERRUPTIBLE); + io_wq_check_workers(wq); schedule_timeout(HZ); - } - - if (current->task_works) - task_work_run(); + try_to_freeze(); + if (fatal_signal_pending(current)) + set_bit(IO_WQ_BIT_EXIT, &wq->state); + } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)); -out: - if (refcount_dec_and_test(&wq->refs)) { - complete(&wq->done); - return 0; - } - /* if ERROR is set and we get here, we have workers to wake */ - if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { - rcu_read_lock(); - for_each_node(node) - io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); - rcu_read_unlock(); - } - return 0; -} - -static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, - struct io_wq_work *work) -{ - bool free_worker; - - if (!(work->flags & IO_WQ_WORK_UNBOUND)) - return true; - if (atomic_read(&acct->nr_running)) - return true; + io_wq_check_workers(wq); rcu_read_lock(); - free_worker = !hlist_nulls_empty(&wqe->free_list); + for_each_node(node) + io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); rcu_read_unlock(); - if (free_worker) - return true; - if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers && - !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN))) - return false; + /* we might not ever have created any workers */ + if (atomic_read(&wq->worker_refs)) + wait_for_completion(&wq->worker_done); - return true; + spin_lock_irq(&wq->hash->wait.lock); + for_each_node(node) + list_del_init(&wq->wqes[node]->wait.entry); + spin_unlock_irq(&wq->hash->wait.lock); + + io_wq_cancel_pending(wq); + complete(&wq->exited); + do_exit(0); } static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) @@ -872,20 +767,35 @@ append: wq_list_add_after(&work->list, &tail->list, &wqe->work_list); } +static int io_wq_fork_manager(struct io_wq *wq) +{ + struct task_struct *tsk; + + if (wq->manager) + return 0; + + reinit_completion(&wq->worker_done); + tsk = create_io_thread(io_wq_manager, wq, NUMA_NO_NODE); + if (!IS_ERR(tsk)) { + wq->manager = get_task_struct(tsk); + wake_up_new_task(tsk); + return 0; + } + + return PTR_ERR(tsk); +} + static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); int work_flags; unsigned long flags; - /* - * Do early check to see if we need a new unbound worker, and if we do, - * if we're allowed to do so. This isn't 100% accurate as there's a - * gap between this check and incrementing the value, but that's OK. - * It's close enough to not be an issue, fork() has the same delay. - */ - if (unlikely(!io_wq_can_queue(wqe, acct, work))) { - io_run_cancel(work, wqe); + /* Can only happen if manager creation fails after exec */ + if (io_wq_fork_manager(wqe->wq) || + test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) { + work->flags |= IO_WQ_WORK_CANCEL; + wqe->wq->do_work(work); return; } @@ -919,14 +829,6 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); } -struct io_cb_cancel_data { - work_cancel_fn *fn; - void *data; - int nr_running; - int nr_pending; - bool cancel_all; -}; - static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { struct io_cb_cancel_data *match = data; @@ -939,7 +841,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) spin_lock_irqsave(&worker->lock, flags); if (worker->cur_work && match->fn(worker->cur_work, match->data)) { - send_sig(SIGINT, worker->task, 1); + set_notify_signal(worker->task); match->nr_running++; } spin_unlock_irqrestore(&worker->lock, flags); @@ -1043,6 +945,24 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, return IO_WQ_CANCEL_NOTFOUND; } +static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, + int sync, void *key) +{ + struct io_wqe *wqe = container_of(wait, struct io_wqe, wait); + int ret; + + list_del_init(&wait->entry); + + rcu_read_lock(); + ret = io_wqe_activate_free_worker(wqe); + rcu_read_unlock(); + + if (!ret) + wake_up_process(wqe->wq->manager); + + return 1; +} + struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { int ret = -ENOMEM, node; @@ -1063,12 +983,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) if (ret) goto err_wqes; + refcount_inc(&data->hash->refs); + wq->hash = data->hash; wq->free_work = data->free_work; wq->do_work = data->do_work; - /* caller must already hold a reference to this */ - wq->user = data->user; - ret = -ENOMEM; for_each_node(node) { struct io_wqe *wqe; @@ -1083,11 +1002,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wqe->node = alloc_node; wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0); - if (wq->user) { - wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = + wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = task_rlimit(current, RLIMIT_NPROC); - } atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); + wqe->wait.func = io_wqe_hash_wake; + INIT_LIST_HEAD(&wqe->wait.entry); wqe->wq = wq; raw_spin_lock_init(&wqe->lock); INIT_WQ_LIST(&wqe->work_list); @@ -1095,24 +1014,19 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) INIT_LIST_HEAD(&wqe->all_list); } - init_completion(&wq->done); + wq->task_pid = current->pid; + init_completion(&wq->exited); + refcount_set(&wq->refs, 1); - wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager"); - if (!IS_ERR(wq->manager)) { - wake_up_process(wq->manager); - wait_for_completion(&wq->done); - if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { - ret = -ENOMEM; - goto err; - } - refcount_set(&wq->use_refs, 1); - reinit_completion(&wq->done); + init_completion(&wq->worker_done); + atomic_set(&wq->worker_refs, 0); + + ret = io_wq_fork_manager(wq); + if (!ret) return wq; - } - ret = PTR_ERR(wq->manager); - complete(&wq->done); err: + io_wq_put_hash(data->hash); cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); for_each_node(node) kfree(wq->wqes[node]); @@ -1123,46 +1037,46 @@ err_wq: return ERR_PTR(ret); } -bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) +static void io_wq_destroy_manager(struct io_wq *wq) { - if (data->free_work != wq->free_work || data->do_work != wq->do_work) - return false; - - return refcount_inc_not_zero(&wq->use_refs); + if (wq->manager) { + wake_up_process(wq->manager); + wait_for_completion(&wq->exited); + put_task_struct(wq->manager); + wq->manager = NULL; + } } -static void __io_wq_destroy(struct io_wq *wq) +static void io_wq_destroy(struct io_wq *wq) { int node; cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); set_bit(IO_WQ_BIT_EXIT, &wq->state); - if (wq->manager) - kthread_stop(wq->manager); - - rcu_read_lock(); - for_each_node(node) - io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); - rcu_read_unlock(); + io_wq_destroy_manager(wq); - wait_for_completion(&wq->done); - - for_each_node(node) - kfree(wq->wqes[node]); + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + WARN_ON_ONCE(!wq_list_empty(&wqe->work_list)); + kfree(wqe); + } + io_wq_put_hash(wq->hash); kfree(wq->wqes); kfree(wq); } -void io_wq_destroy(struct io_wq *wq) +void io_wq_put(struct io_wq *wq) { - if (refcount_dec_and_test(&wq->use_refs)) - __io_wq_destroy(wq); + if (refcount_dec_and_test(&wq->refs)) + io_wq_destroy(wq); } -struct task_struct *io_wq_get_task(struct io_wq *wq) +void io_wq_put_and_exit(struct io_wq *wq) { - return wq->manager; + set_bit(IO_WQ_BIT_EXIT, &wq->state); + io_wq_destroy_manager(wq); + io_wq_put(wq); } static bool io_wq_worker_affinity(struct io_worker *worker, void *data) diff --git a/fs/io-wq.h b/fs/io-wq.h index 096f1021018e..5fbf7997149e 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -1,6 +1,7 @@ #ifndef INTERNAL_IO_WQ_H #define INTERNAL_IO_WQ_H +#include <linux/refcount.h> #include <linux/io_uring.h> struct io_wq; @@ -11,13 +12,6 @@ enum { IO_WQ_WORK_UNBOUND = 4, IO_WQ_WORK_CONCURRENT = 16, - IO_WQ_WORK_FILES = 32, - IO_WQ_WORK_FS = 64, - IO_WQ_WORK_MM = 128, - IO_WQ_WORK_CREDS = 256, - IO_WQ_WORK_BLKCG = 512, - IO_WQ_WORK_FSIZE = 1024, - IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -85,8 +79,8 @@ static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work { struct io_wq_work_node list; - struct io_identity *identity; unsigned flags; + unsigned short personality; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) @@ -100,16 +94,27 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); typedef void (io_wq_work_fn)(struct io_wq_work *); -struct io_wq_data { - struct user_struct *user; +struct io_wq_hash { + refcount_t refs; + unsigned long map; + struct wait_queue_head wait; +}; + +static inline void io_wq_put_hash(struct io_wq_hash *hash) +{ + if (refcount_dec_and_test(&hash->refs)) + kfree(hash); +} +struct io_wq_data { + struct io_wq_hash *hash; io_wq_work_fn *do_work; free_work_fn *free_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); -bool io_wq_get(struct io_wq *wq, struct io_wq_data *data); -void io_wq_destroy(struct io_wq *wq); +void io_wq_put(struct io_wq *wq); +void io_wq_put_and_exit(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); @@ -124,8 +129,6 @@ typedef bool (work_cancel_fn)(struct io_wq_work *, void *); enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, void *data, bool cancel_all); -struct task_struct *io_wq_get_task(struct io_wq *wq); - #if defined(CONFIG_IO_WQ) extern void io_wq_worker_sleeping(struct task_struct *); extern void io_wq_worker_running(struct task_struct *); @@ -140,6 +143,7 @@ static inline void io_wq_worker_running(struct task_struct *tsk) static inline bool io_wq_current_is_worker(void) { - return in_task() && (current->flags & PF_IO_WORKER); + return in_task() && (current->flags & PF_IO_WORKER) && + current->pf_io_worker; } #endif diff --git a/fs/io_uring.c b/fs/io_uring.c index 14ce789927e4..92c25b5f1349 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -57,7 +57,6 @@ #include <linux/mman.h> #include <linux/percpu.h> #include <linux/slab.h> -#include <linux/kthread.h> #include <linux/blkdev.h> #include <linux/bvec.h> #include <linux/net.h> @@ -75,13 +74,11 @@ #include <linux/fsnotify.h> #include <linux/fadvise.h> #include <linux/eventpoll.h> -#include <linux/fs_struct.h> #include <linux/splice.h> #include <linux/task_work.h> #include <linux/pagemap.h> #include <linux/io_uring.h> -#include <linux/blk-cgroup.h> -#include <linux/audit.h> +#include <linux/freezer.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -104,6 +101,10 @@ #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) +#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ + IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ + IOSQE_BUFFER_SELECT) + struct io_uring { u32 head ____cacheline_aligned_in_smp; u32 tail ____cacheline_aligned_in_smp; @@ -232,6 +233,7 @@ struct fixed_rsrc_data { struct fixed_rsrc_ref_node *node; struct percpu_ref refs; struct completion done; + bool quiesce; }; struct io_buffer { @@ -249,6 +251,11 @@ struct io_restriction { bool registered; }; +enum { + IO_SQ_THREAD_SHOULD_STOP = 0, + IO_SQ_THREAD_SHOULD_PARK, +}; + struct io_sq_data { refcount_t refs; struct mutex lock; @@ -262,6 +269,13 @@ struct io_sq_data { struct wait_queue_head wait; unsigned sq_thread_idle; + int sq_cpu; + pid_t task_pid; + + unsigned long state; + struct completion startup; + struct completion parked; + struct completion exited; }; #define IO_IOPOLL_BATCH 8 @@ -279,8 +293,14 @@ struct io_comp_state { struct list_head locked_free_list; }; +struct io_submit_link { + struct io_kiocb *head; + struct io_kiocb *last; +}; + struct io_submit_state { struct blk_plug plug; + struct io_submit_link link; /* * io_kiocb alloc cache @@ -312,12 +332,11 @@ struct io_ring_ctx { struct { unsigned int flags; unsigned int compat: 1; - unsigned int limit_mem: 1; unsigned int cq_overflow_flushed: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; unsigned int restricted: 1; - unsigned int sqo_dead: 1; + unsigned int sqo_exec: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -339,6 +358,9 @@ struct io_ring_ctx { unsigned cached_cq_overflow; unsigned long sq_check_overflow; + /* hashed buffered write serialization */ + struct io_wq_hash *hash_map; + struct list_head defer_list; struct list_head timeout_list; struct list_head cq_overflow_list; @@ -355,22 +377,9 @@ struct io_ring_ctx { struct io_rings *rings; - /* IO offload */ - struct io_wq *io_wq; - - /* - * For SQPOLL usage - we hold a reference to the parent task, so we - * have access to the ->files - */ - struct task_struct *sqo_task; - /* Only used for accounting purposes */ struct mm_struct *mm_account; -#ifdef CONFIG_BLK_CGROUP - struct cgroup_subsys_state *sqo_blkcg_css; -#endif - struct io_sq_data *sq_data; /* if using sq thread polling */ struct wait_queue_head sqo_sq_wait; @@ -390,13 +399,6 @@ struct io_ring_ctx { struct user_struct *user; - const struct cred *creds; - -#ifdef CONFIG_AUDIT - kuid_t loginuid; - unsigned int sessionid; -#endif - struct completion ref_comp; struct completion sq_thread_comp; @@ -445,6 +447,11 @@ struct io_ring_ctx { struct io_restriction restrictions; + /* exit task_work */ + struct callback_head *exit_task_work; + + struct wait_queue_head hash_wait; + /* Keep this last, we don't need it for the fast path */ struct work_struct exit_work; }; @@ -673,7 +680,6 @@ enum { REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, - REQ_F_WORK_INITIALIZED_BIT, REQ_F_LTIMEOUT_ACTIVE_BIT, REQ_F_COMPLETE_INLINE_BIT, @@ -697,7 +703,7 @@ enum { /* fail rest of links */ REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), - /* on inflight list */ + /* on inflight list, should be cancelled and waited on exit reliably */ REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), /* read/write uses file position */ REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), @@ -715,8 +721,6 @@ enum { REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), /* doesn't need file table for this request */ REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), - /* io_wq_work is initialized */ - REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), /* linked timeout is active, i.e. prepared by link's head */ REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT), /* completion is deferred through io_comp_state */ @@ -827,7 +831,6 @@ struct io_op_def { unsigned plug : 1; /* size of async data needed, if any */ unsigned short async_size; - unsigned work_flags; }; static const struct io_op_def io_op_defs[] = { @@ -840,7 +843,6 @@ static const struct io_op_def io_op_defs[] = { .needs_async_data = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_WRITEV] = { .needs_file = 1, @@ -850,12 +852,9 @@ static const struct io_op_def io_op_defs[] = { .needs_async_data = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FSIZE, }, [IORING_OP_FSYNC] = { .needs_file = 1, - .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_READ_FIXED] = { .needs_file = 1, @@ -863,7 +862,6 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -872,8 +870,6 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE | - IO_WQ_WORK_MM, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -882,7 +878,6 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_POLL_REMOVE] = {}, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, - .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_SENDMSG] = { .needs_file = 1, @@ -890,8 +885,6 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_msghdr), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS, }, [IORING_OP_RECVMSG] = { .needs_file = 1, @@ -900,29 +893,23 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_msghdr), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS, }, [IORING_OP_TIMEOUT] = { .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), - .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_TIMEOUT_REMOVE] = { /* used by timeout updates' prep() */ - .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_ACCEPT] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES, }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), - .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_CONNECT] = { .needs_file = 1, @@ -930,26 +917,14 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_connect), - .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, - .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE, - }, - [IORING_OP_OPENAT] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS | IO_WQ_WORK_MM, - }, - [IORING_OP_CLOSE] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG, - }, - [IORING_OP_FILES_UPDATE] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM, - }, - [IORING_OP_STATX] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM | - IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, }, + [IORING_OP_OPENAT] = {}, + [IORING_OP_CLOSE] = {}, + [IORING_OP_FILES_UPDATE] = {}, + [IORING_OP_STATX] = {}, [IORING_OP_READ] = { .needs_file = 1, .unbound_nonreg_file = 1, @@ -957,7 +932,6 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_WRITE] = { .needs_file = 1, @@ -965,42 +939,31 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FSIZE, }, [IORING_OP_FADVISE] = { .needs_file = 1, - .work_flags = IO_WQ_WORK_BLKCG, - }, - [IORING_OP_MADVISE] = { - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, + [IORING_OP_MADVISE] = {}, [IORING_OP_SEND] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_RECV] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_OPENAT2] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS | - IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, - .work_flags = IO_WQ_WORK_FILES, }, [IORING_OP_SPLICE] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, - .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_PROVIDE_BUFFERS] = {}, [IORING_OP_REMOVE_BUFFERS] = {}, @@ -1012,24 +975,18 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_SHUTDOWN] = { .needs_file = 1, }, - [IORING_OP_RENAMEAT] = { - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | - IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, - }, - [IORING_OP_UNLINKAT] = { - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | - IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, - }, + [IORING_OP_RENAMEAT] = {}, + [IORING_OP_UNLINKAT] = {}, }; static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files); +static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx); static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node); static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node( struct io_ring_ctx *ctx); -static void init_fixed_file_ref_node(struct io_ring_ctx *ctx, - struct fixed_rsrc_ref_node *ref_node); +static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static bool io_rw_reissue(struct io_kiocb *req); static void io_cqring_fill_event(struct io_kiocb *req, long res); @@ -1112,190 +1069,20 @@ static bool io_match_task(struct io_kiocb *head, return true; io_for_each_link(req, head) { - if (!(req->flags & REQ_F_WORK_INITIALIZED)) - continue; - if (req->file && req->file->f_op == &io_uring_fops) + if (req->flags & REQ_F_INFLIGHT) return true; - if ((req->work.flags & IO_WQ_WORK_FILES) && - req->work.identity->files == files) + if (req->task->files == files) return true; } return false; } -static void io_sq_thread_drop_mm_files(void) -{ - struct files_struct *files = current->files; - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - current->mm = NULL; - } - if (files) { - struct nsproxy *nsproxy = current->nsproxy; - - task_lock(current); - current->files = NULL; - current->nsproxy = NULL; - task_unlock(current); - put_files_struct(files); - put_nsproxy(nsproxy); - } -} - -static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) -{ - if (!current->files) { - struct files_struct *files; - struct nsproxy *nsproxy; - - task_lock(ctx->sqo_task); - files = ctx->sqo_task->files; - if (!files) { - task_unlock(ctx->sqo_task); - return -EOWNERDEAD; - } - atomic_inc(&files->count); - get_nsproxy(ctx->sqo_task->nsproxy); - nsproxy = ctx->sqo_task->nsproxy; - task_unlock(ctx->sqo_task); - - task_lock(current); - current->files = files; - current->nsproxy = nsproxy; - task_unlock(current); - } - return 0; -} - -static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm; - - if (current->mm) - return 0; - - task_lock(ctx->sqo_task); - mm = ctx->sqo_task->mm; - if (unlikely(!mm || !mmget_not_zero(mm))) - mm = NULL; - task_unlock(ctx->sqo_task); - - if (mm) { - kthread_use_mm(mm); - return 0; - } - - return -EFAULT; -} - -static int __io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - int ret; - - if (def->work_flags & IO_WQ_WORK_MM) { - ret = __io_sq_thread_acquire_mm(ctx); - if (unlikely(ret)) - return ret; - } - - if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) { - ret = __io_sq_thread_acquire_files(ctx); - if (unlikely(ret)) - return ret; - } - - return 0; -} - -static inline int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - if (!(ctx->flags & IORING_SETUP_SQPOLL)) - return 0; - return __io_sq_thread_acquire_mm_files(ctx, req); -} - -static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, - struct cgroup_subsys_state **cur_css) - -{ -#ifdef CONFIG_BLK_CGROUP - /* puts the old one when swapping */ - if (*cur_css != ctx->sqo_blkcg_css) { - kthread_associate_blkcg(ctx->sqo_blkcg_css); - *cur_css = ctx->sqo_blkcg_css; - } -#endif -} - -static void io_sq_thread_unassociate_blkcg(void) -{ -#ifdef CONFIG_BLK_CGROUP - kthread_associate_blkcg(NULL); -#endif -} - static inline void req_set_fail_links(struct io_kiocb *req) { if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) req->flags |= REQ_F_FAIL_LINK; } -/* - * None of these are dereferenced, they are simply used to check if any of - * them have changed. If we're under current and check they are still the - * same, we're fine to grab references to them for actual out-of-line use. - */ -static void io_init_identity(struct io_identity *id) -{ - id->files = current->files; - id->mm = current->mm; -#ifdef CONFIG_BLK_CGROUP - rcu_read_lock(); - id->blkcg_css = blkcg_css(); - rcu_read_unlock(); -#endif - id->creds = current_cred(); - id->nsproxy = current->nsproxy; - id->fs = current->fs; - id->fsize = rlimit(RLIMIT_FSIZE); -#ifdef CONFIG_AUDIT - id->loginuid = current->loginuid; - id->sessionid = current->sessionid; -#endif - refcount_set(&id->count, 1); -} - -static inline void __io_req_init_async(struct io_kiocb *req) -{ - memset(&req->work, 0, sizeof(req->work)); - req->flags |= REQ_F_WORK_INITIALIZED; -} - -/* - * Note: must call io_req_init_async() for the first time you - * touch any members of io_wq_work. - */ -static inline void io_req_init_async(struct io_kiocb *req) -{ - struct io_uring_task *tctx = current->io_uring; - - if (req->flags & REQ_F_WORK_INITIALIZED) - return; - - __io_req_init_async(req); - - /* Grab a ref if this isn't our static identity */ - req->work.identity = tctx->identity; - if (tctx->identity != &tctx->__identity) - refcount_inc(&req->work.identity->count); -} - static void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); @@ -1378,111 +1165,11 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req) -{ - if (req->work.identity == &tctx->__identity) - return; - if (refcount_dec_and_test(&req->work.identity->count)) - kfree(req->work.identity); -} - -static void io_req_clean_work(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_WORK_INITIALIZED)) - return; - - if (req->work.flags & IO_WQ_WORK_MM) - mmdrop(req->work.identity->mm); -#ifdef CONFIG_BLK_CGROUP - if (req->work.flags & IO_WQ_WORK_BLKCG) - css_put(req->work.identity->blkcg_css); -#endif - if (req->work.flags & IO_WQ_WORK_CREDS) - put_cred(req->work.identity->creds); - if (req->work.flags & IO_WQ_WORK_FS) { - struct fs_struct *fs = req->work.identity->fs; - - spin_lock(&req->work.identity->fs->lock); - if (--fs->users) - fs = NULL; - spin_unlock(&req->work.identity->fs->lock); - if (fs) - free_fs_struct(fs); - } - if (req->work.flags & IO_WQ_WORK_FILES) { - put_files_struct(req->work.identity->files); - put_nsproxy(req->work.identity->nsproxy); - } - if (req->flags & REQ_F_INFLIGHT) { - struct io_ring_ctx *ctx = req->ctx; - struct io_uring_task *tctx = req->task->io_uring; - unsigned long flags; - - spin_lock_irqsave(&ctx->inflight_lock, flags); - list_del(&req->inflight_entry); - spin_unlock_irqrestore(&ctx->inflight_lock, flags); - req->flags &= ~REQ_F_INFLIGHT; - if (atomic_read(&tctx->in_idle)) - wake_up(&tctx->wait); - } - - req->flags &= ~REQ_F_WORK_INITIALIZED; - req->work.flags &= ~(IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FS | - IO_WQ_WORK_CREDS | IO_WQ_WORK_FILES); - io_put_identity(req->task->io_uring, req); -} - -/* - * Create a private copy of io_identity, since some fields don't match - * the current context. - */ -static bool io_identity_cow(struct io_kiocb *req) -{ - struct io_uring_task *tctx = current->io_uring; - const struct cred *creds = NULL; - struct io_identity *id; - - if (req->work.flags & IO_WQ_WORK_CREDS) - creds = req->work.identity->creds; - - id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL); - if (unlikely(!id)) { - req->work.flags |= IO_WQ_WORK_CANCEL; - return false; - } - - /* - * We can safely just re-init the creds we copied Either the field - * matches the current one, or we haven't grabbed it yet. The only - * exception is ->creds, through registered personalities, so handle - * that one separately. - */ - io_init_identity(id); - if (creds) - id->creds = creds; - - /* add one for this request */ - refcount_inc(&id->count); - - /* drop tctx and req identity references, if needed */ - if (tctx->identity != &tctx->__identity && - refcount_dec_and_test(&tctx->identity->count)) - kfree(tctx->identity); - if (req->work.identity != &tctx->__identity && - refcount_dec_and_test(&req->work.identity->count)) - kfree(req->work.identity); - - req->work.identity = id; - tctx->identity = id; - return true; -} - static void io_req_track_inflight(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_INFLIGHT)) { - io_req_init_async(req); req->flags |= REQ_F_INFLIGHT; spin_lock_irq(&ctx->inflight_lock); @@ -1491,86 +1178,11 @@ static void io_req_track_inflight(struct io_kiocb *req) } } -static bool io_grab_identity(struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - struct io_identity *id = req->work.identity; - - if (def->work_flags & IO_WQ_WORK_FSIZE) { - if (id->fsize != rlimit(RLIMIT_FSIZE)) - return false; - req->work.flags |= IO_WQ_WORK_FSIZE; - } -#ifdef CONFIG_BLK_CGROUP - if (!(req->work.flags & IO_WQ_WORK_BLKCG) && - (def->work_flags & IO_WQ_WORK_BLKCG)) { - rcu_read_lock(); - if (id->blkcg_css != blkcg_css()) { - rcu_read_unlock(); - return false; - } - /* - * This should be rare, either the cgroup is dying or the task - * is moving cgroups. Just punt to root for the handful of ios. - */ - if (css_tryget_online(id->blkcg_css)) - req->work.flags |= IO_WQ_WORK_BLKCG; - rcu_read_unlock(); - } -#endif - if (!(req->work.flags & IO_WQ_WORK_CREDS)) { - if (id->creds != current_cred()) - return false; - get_cred(id->creds); - req->work.flags |= IO_WQ_WORK_CREDS; - } -#ifdef CONFIG_AUDIT - if (!uid_eq(current->loginuid, id->loginuid) || - current->sessionid != id->sessionid) - return false; -#endif - if (!(req->work.flags & IO_WQ_WORK_FS) && - (def->work_flags & IO_WQ_WORK_FS)) { - if (current->fs != id->fs) - return false; - spin_lock(&id->fs->lock); - if (!id->fs->in_exec) { - id->fs->users++; - req->work.flags |= IO_WQ_WORK_FS; - } else { - req->work.flags |= IO_WQ_WORK_CANCEL; - } - spin_unlock(¤t->fs->lock); - } - if (!(req->work.flags & IO_WQ_WORK_FILES) && - (def->work_flags & IO_WQ_WORK_FILES) && - !(req->flags & REQ_F_NO_FILE_TABLE)) { - if (id->files != current->files || - id->nsproxy != current->nsproxy) - return false; - atomic_inc(&id->files->count); - get_nsproxy(id->nsproxy); - req->work.flags |= IO_WQ_WORK_FILES; - io_req_track_inflight(req); - } - if (!(req->work.flags & IO_WQ_WORK_MM) && - (def->work_flags & IO_WQ_WORK_MM)) { - if (id->mm != current->mm) - return false; - mmgrab(id->mm); - req->work.flags |= IO_WQ_WORK_MM; - } - - return true; -} - static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; - io_req_init_async(req); - if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; @@ -1581,17 +1193,6 @@ static void io_prep_async_work(struct io_kiocb *req) if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - - /* if we fail grabbing identity, we must COW, regrab, and retry */ - if (io_grab_identity(req)) - return; - - if (!io_identity_cow(req)) - return; - - /* can't fail at this point */ - if (!io_grab_identity(req)) - WARN_ON(1); } static void io_prep_async_link(struct io_kiocb *req) @@ -1602,25 +1203,20 @@ static void io_prep_async_link(struct io_kiocb *req) io_prep_async_work(cur); } -static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) +static void io_queue_async_work(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); + struct io_uring_task *tctx = req->task->io_uring; + + BUG_ON(!tctx); + BUG_ON(!tctx->io_wq); trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, &req->work, req->flags); - io_wq_enqueue(ctx->io_wq, &req->work); - return link; -} - -static void io_queue_async_work(struct io_kiocb *req) -{ - struct io_kiocb *link; - /* init ->work of the whole link before punting */ io_prep_async_link(req); - link = __io_queue_async_work(req); - + io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); } @@ -1855,18 +1451,22 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, return all_flushed; } -static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, struct task_struct *tsk, struct files_struct *files) { + bool ret = true; + if (test_bit(0, &ctx->cq_check_overflow)) { /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); - __io_cqring_overflow_flush(ctx, force, tsk, files); + ret = __io_cqring_overflow_flush(ctx, force, tsk, files); if (ctx->flags & IORING_SETUP_IOPOLL) mutex_unlock(&ctx->uring_lock); } + + return ret; } static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) @@ -2048,9 +1648,19 @@ static void io_dismantle_req(struct io_kiocb *req) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); if (req->fixed_rsrc_refs) percpu_ref_put(req->fixed_rsrc_refs); - io_req_clean_work(req); + + if (req->flags & REQ_F_INFLIGHT) { + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->inflight_lock, flags); + list_del(&req->inflight_entry); + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + req->flags &= ~REQ_F_INFLIGHT; + } } +/* must to be called somewhat shortly after putting a request */ static inline void io_put_task(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; @@ -2134,15 +1744,7 @@ static void io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); io_cqring_fill_event(link, -ECANCELED); - /* - * It's ok to free under spinlock as they're not linked anymore, - * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on - * work.fs->lock. - */ - if (link->flags & REQ_F_WORK_INITIALIZED) - io_put_req_deferred(link, 2); - else - io_double_put_req(link); + io_put_req_deferred(link, 2); link = nxt; } io_commit_cqring(ctx); @@ -2179,6 +1781,18 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } +static void ctx_flush_and_put(struct io_ring_ctx *ctx) +{ + if (!ctx) + return; + if (ctx->submit_state.comp.nr) { + mutex_lock(&ctx->uring_lock); + io_submit_flush_completions(&ctx->submit_state.comp, ctx); + mutex_unlock(&ctx->uring_lock); + } + percpu_ref_put(&ctx->refs); +} + static bool __tctx_task_work(struct io_uring_task *tctx) { struct io_ring_ctx *ctx = NULL; @@ -2196,30 +1810,20 @@ static bool __tctx_task_work(struct io_uring_task *tctx) node = list.first; while (node) { struct io_wq_work_node *next = node->next; - struct io_ring_ctx *this_ctx; struct io_kiocb *req; req = container_of(node, struct io_kiocb, io_task_work.node); - this_ctx = req->ctx; - req->task_work.func(&req->task_work); - node = next; - - if (!ctx) { - ctx = this_ctx; - } else if (ctx != this_ctx) { - mutex_lock(&ctx->uring_lock); - io_submit_flush_completions(&ctx->submit_state.comp, ctx); - mutex_unlock(&ctx->uring_lock); - ctx = this_ctx; + if (req->ctx != ctx) { + ctx_flush_and_put(ctx); + ctx = req->ctx; + percpu_ref_get(&ctx->refs); } - } - if (ctx && ctx->submit_state.comp.nr) { - mutex_lock(&ctx->uring_lock); - io_submit_flush_completions(&ctx->submit_state.comp, ctx); - mutex_unlock(&ctx->uring_lock); + req->task_work.func(&req->task_work); + node = next; } + ctx_flush_and_put(ctx); return list.first != NULL; } @@ -2227,10 +1831,10 @@ static void tctx_task_work(struct callback_head *cb) { struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); + clear_bit(0, &tctx->task_state); + while (__tctx_task_work(tctx)) cond_resched(); - - clear_bit(0, &tctx->task_state); } static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req, @@ -2303,11 +1907,14 @@ static int io_req_task_work_add(struct io_kiocb *req) static void io_req_task_work_add_fallback(struct io_kiocb *req, task_work_func_t cb) { - struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq); + struct io_ring_ctx *ctx = req->ctx; + struct callback_head *head; init_task_work(&req->task_work, cb); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); + do { + head = READ_ONCE(ctx->exit_task_work); + req->task_work.next = head; + } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head); } static void __io_req_task_cancel(struct io_kiocb *req, int error) @@ -2329,7 +1936,9 @@ static void io_req_task_cancel(struct callback_head *cb) struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct io_ring_ctx *ctx = req->ctx; - __io_req_task_cancel(req, -ECANCELED); + mutex_lock(&ctx->uring_lock); + __io_req_task_cancel(req, req->result); + mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } @@ -2339,15 +1948,11 @@ static void __io_req_task_submit(struct io_kiocb *req) /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ mutex_lock(&ctx->uring_lock); - if (!ctx->sqo_dead && !(current->flags & PF_EXITING) && - !io_sq_thread_acquire_mm_files(ctx, req)) + if (!(current->flags & PF_EXITING) && !current->in_execve) __io_queue_sqe(req); else __io_req_task_cancel(req, -EFAULT); mutex_unlock(&ctx->uring_lock); - - if (ctx->flags & IORING_SETUP_SQPOLL) - io_sq_thread_drop_mm_files(); } static void io_req_task_submit(struct callback_head *cb) @@ -2364,11 +1969,22 @@ static void io_req_task_queue(struct io_kiocb *req) req->task_work.func = io_req_task_submit; ret = io_req_task_work_add(req); if (unlikely(ret)) { + req->result = -ECANCELED; percpu_ref_get(&req->ctx->refs); io_req_task_work_add_fallback(req, io_req_task_cancel); } } +static void io_req_task_queue_fail(struct io_kiocb *req, int ret) +{ + percpu_ref_get(&req->ctx->refs); + req->result = ret; + req->task_work.func = io_req_task_cancel; + + if (unlikely(io_req_task_work_add(req))) + io_req_task_work_add_fallback(req, io_req_task_cancel); +} + static inline void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -2794,24 +2410,37 @@ static bool io_resubmit_prep(struct io_kiocb *req) return false; return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false); } -#endif -static bool io_rw_reissue(struct io_kiocb *req) +static bool io_rw_should_reissue(struct io_kiocb *req) { -#ifdef CONFIG_BLOCK umode_t mode = file_inode(req->file)->i_mode; - int ret; + struct io_ring_ctx *ctx = req->ctx; if (!S_ISBLK(mode) && !S_ISREG(mode)) return false; - if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker()) + if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && + !(ctx->flags & IORING_SETUP_IOPOLL))) return false; + /* + * If ref is dying, we might be running poll reap from the exit work. + * Don't attempt to reissue from that path, just let it fail with + * -EAGAIN. + */ + if (percpu_ref_is_dying(&ctx->refs)) + return false; + return true; +} +#endif - lockdep_assert_held(&req->ctx->uring_lock); +static bool io_rw_reissue(struct io_kiocb *req) +{ +#ifdef CONFIG_BLOCK + if (!io_rw_should_reissue(req)) + return false; - ret = io_sq_thread_acquire_mm_files(req->ctx, req); + lockdep_assert_held(&req->ctx->uring_lock); - if (!ret && io_resubmit_prep(req)) { + if (io_resubmit_prep(req)) { refcount_inc(&req->refs); io_queue_async_work(req); return true; @@ -2849,6 +2478,19 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); +#ifdef CONFIG_BLOCK + /* Rewind iter, if we have one. iopoll path resubmits as usual */ + if (res == -EAGAIN && io_rw_should_reissue(req)) { + struct io_async_rw *rw = req->async_data; + + if (rw) + iov_iter_revert(&rw->iter, + req->result - iov_iter_count(&rw->iter)); + else if (!io_resubmit_prep(req)) + res = -EIO; + } +#endif + if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); @@ -3467,19 +3109,9 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - ssize_t ret; - - ret = io_prep_rw(req, sqe); - if (ret) - return ret; - if (unlikely(!(req->file->f_mode & FMODE_READ))) return -EBADF; - - /* either don't need iovec imported or already have it */ - if (!req->async_data) - return 0; - return io_rw_prep_async(req, READ); + return io_prep_rw(req, sqe); } /* @@ -3607,10 +3239,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(req, iter); if (ret == -EIOCBQUEUED) { - /* it's faster to check here then delegate to kfree */ - if (iovec) - kfree(iovec); - return 0; + if (req->async_data) + iov_iter_revert(iter, io_size - iov_iter_count(iter)); + goto out_free; } else if (ret == -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -3631,6 +3262,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) if (ret2) return ret2; + iovec = NULL; rw = req->async_data; /* now use our persistent iterator, if we aren't already */ iter = &rw->iter; @@ -3654,27 +3286,22 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EIOCBQUEUED) return 0; /* we got some bytes, but not all. retry. */ + kiocb->ki_flags &= ~IOCB_WAITQ; } while (ret > 0 && ret < io_size); done: kiocb_done(kiocb, ret, issue_flags); +out_free: + /* it's faster to check here then delegate to kfree */ + if (iovec) + kfree(iovec); return 0; } static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - ssize_t ret; - - ret = io_prep_rw(req, sqe); - if (ret) - return ret; - if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - - /* either don't need iovec imported or already have it */ - if (!req->async_data) - return 0; - return io_rw_prep_async(req, WRITE); + return io_prep_rw(req, sqe); } static int io_write(struct io_kiocb *req, unsigned int issue_flags) @@ -3746,6 +3373,8 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) /* no retry on NONBLOCK nor RWF_NOWAIT */ if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) goto done; + if (ret2 == -EIOCBQUEUED && req->async_data) + iov_iter_revert(iter, io_size - iov_iter_count(iter)); if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) @@ -3924,7 +3553,6 @@ static int __io_splice_prep(struct io_kiocb *req, * Splice operation will be punted aync, and here need to * modify io_wq_work.flags, so initialize io_wq_work firstly. */ - io_req_init_async(req); req->work.flags |= IO_WQ_WORK_UNBOUND; } @@ -4011,7 +3639,7 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -4200,7 +3828,7 @@ err: static int io_openat(struct io_kiocb *req, unsigned int issue_flags) { - return io_openat2(req, issue_flags & IO_URING_F_NONBLOCK); + return io_openat2(req, issue_flags); } static int io_remove_buffers_prep(struct io_kiocb *req, @@ -4598,13 +4226,10 @@ err: return 0; } -static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - if (!req->file) - return -EBADF; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) @@ -4664,11 +4289,21 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, req->sr_msg.msg_flags, &iomsg->free_iov); } +static int io_sendmsg_prep_async(struct io_kiocb *req) +{ + int ret; + + if (!io_op_defs[req->opcode].needs_async_data) + return 0; + ret = io_sendmsg_copy_hdr(req, req->async_data); + if (!ret) + req->flags |= REQ_F_NEED_CLEANUP; + return ret; +} + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_async_msghdr *async_msg = req->async_data; struct io_sr_msg *sr = &req->sr_msg; - int ret; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4681,13 +4316,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - - if (!async_msg || !io_op_defs[req->opcode].needs_async_data) - return 0; - ret = io_sendmsg_copy_hdr(req, async_msg); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; + return 0; } static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) @@ -4881,13 +4510,22 @@ static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) return io_put_kbuf(req, req->sr_msg.kbuf); } -static int io_recvmsg_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_recvmsg_prep_async(struct io_kiocb *req) { - struct io_async_msghdr *async_msg = req->async_data; - struct io_sr_msg *sr = &req->sr_msg; int ret; + if (!io_op_defs[req->opcode].needs_async_data) + return 0; + ret = io_recvmsg_copy_hdr(req, req->async_data); + if (!ret) + req->flags |= REQ_F_NEED_CLEANUP; + return ret; +} + +static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sr_msg *sr = &req->sr_msg; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4900,13 +4538,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - - if (!async_msg || !io_op_defs[req->opcode].needs_async_data) - return 0; - ret = io_recvmsg_copy_hdr(req, async_msg); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; + return 0; } static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) @@ -5059,10 +4691,17 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static int io_connect_prep_async(struct io_kiocb *req) +{ + struct io_async_connect *io = req->async_data; + struct io_connect *conn = &req->connect; + + return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); +} + static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = &req->connect; - struct io_async_connect *io = req->async_data; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -5071,12 +4710,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); conn->addr_len = READ_ONCE(sqe->addr2); - - if (!io) - return 0; - - return move_addr_to_kernel(conn->addr, conn->addr_len, - &io->address); + return 0; } static int io_connect(struct io_kiocb *req, unsigned int issue_flags) @@ -5121,56 +4755,32 @@ out: return 0; } #else /* !CONFIG_NET */ -static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; -} - -static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} - -static int io_send(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} - -static int io_recvmsg_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; -} - -static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} - -static int io_recv(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} - -static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; -} - -static int io_accept(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} - -static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; -} - -static int io_connect(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} +#define IO_NETOP_FN(op) \ +static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \ +{ \ + return -EOPNOTSUPP; \ +} + +#define IO_NETOP_PREP(op) \ +IO_NETOP_FN(op) \ +static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \ +{ \ + return -EOPNOTSUPP; \ +} \ + +#define IO_NETOP_PREP_ASYNC(op) \ +IO_NETOP_PREP(op) \ +static int io_##op##_prep_async(struct io_kiocb *req) \ +{ \ + return -EOPNOTSUPP; \ +} + +IO_NETOP_PREP_ASYNC(sendmsg); +IO_NETOP_PREP_ASYNC(recvmsg); +IO_NETOP_PREP_ASYNC(connect); +IO_NETOP_PREP(accept); +IO_NETOP_FN(send); +IO_NETOP_FN(recv); #endif /* CONFIG_NET */ struct io_poll_table { @@ -5357,6 +4967,9 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = -EINVAL; return; } + /* double add on the same waitqueue head, ignore */ + if (poll->head == head) + return; poll = kmalloc(sizeof(*poll), GFP_ATOMIC); if (!poll) { pt->error = -ENOMEM; @@ -5892,6 +5505,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); + io_req_track_inflight(req); return 0; } @@ -5952,12 +5566,15 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data) return req->user_data == (unsigned long) data; } -static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) +static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr) { enum io_wq_cancel cancel_ret; int ret = 0; - cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false); + if (!tctx->io_wq) + return -ENOENT; + + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -5980,7 +5597,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, unsigned long flags; int ret; - ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr); + ret = io_async_cancel_one(req->task->io_uring, + (void *) (unsigned long) sqe_addr); if (ret != -ENOENT) { spin_lock_irqsave(&ctx->completion_lock, flags); goto done; @@ -6084,9 +5702,9 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_POLL_REMOVE: return io_poll_remove_prep(req, sqe); case IORING_OP_FSYNC: - return io_prep_fsync(req, sqe); + return io_fsync_prep(req, sqe); case IORING_OP_SYNC_FILE_RANGE: - return io_prep_sfr(req, sqe); + return io_sfr_prep(req, sqe); case IORING_OP_SENDMSG: case IORING_OP_SEND: return io_sendmsg_prep(req, sqe); @@ -6144,14 +5762,39 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return-EINVAL; } -static int io_req_defer_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_req_prep_async(struct io_kiocb *req) +{ + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + return io_rw_prep_async(req, READ); + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + return io_rw_prep_async(req, WRITE); + case IORING_OP_SENDMSG: + case IORING_OP_SEND: + return io_sendmsg_prep_async(req); + case IORING_OP_RECVMSG: + case IORING_OP_RECV: + return io_recvmsg_prep_async(req); + case IORING_OP_CONNECT: + return io_connect_prep_async(req); + } + return 0; +} + +static int io_req_defer_prep(struct io_kiocb *req) { - if (!sqe) + if (!io_op_defs[req->opcode].needs_async_data) return 0; - if (io_alloc_async_data(req)) + /* some opcodes init it during the inital prep */ + if (req->async_data) + return 0; + if (__io_alloc_async_data(req)) return -EAGAIN; - return io_req_prep(req, sqe); + return io_req_prep_async(req); } static u32 io_get_sequence(struct io_kiocb *req) @@ -6167,7 +5810,7 @@ static u32 io_get_sequence(struct io_kiocb *req) return total_submitted - nr_reqs; } -static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_req_defer(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; @@ -6184,11 +5827,9 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) return 0; - if (!req->async_data) { - ret = io_req_defer_prep(req, sqe); - if (ret) - return ret; - } + ret = io_req_defer_prep(req); + if (ret) + return ret; io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL); if (!de) @@ -6272,8 +5913,22 @@ static void __io_clean_op(struct io_kiocb *req) static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; + const struct cred *creds = NULL; int ret; + if (req->work.personality) { + const struct cred *new_creds; + + if (!(issue_flags & IO_URING_F_NONBLOCK)) + mutex_lock(&ctx->uring_lock); + new_creds = idr_find(&ctx->personality_idr, req->work.personality); + if (!(issue_flags & IO_URING_F_NONBLOCK)) + mutex_unlock(&ctx->uring_lock); + if (!new_creds) + return -EINVAL; + creds = override_creds(new_creds); + } + switch (req->opcode) { case IORING_OP_NOP: ret = io_nop(req, issue_flags); @@ -6380,6 +6035,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) break; } + if (creds) + revert_creds(creds); + if (ret) return ret; @@ -6427,29 +6085,11 @@ static void io_wq_submit_work(struct io_wq_work *work) } while (1); } + /* avoid locking problems by failing it from a clean context */ if (ret) { - struct io_ring_ctx *lock_ctx = NULL; - - if (req->ctx->flags & IORING_SETUP_IOPOLL) - lock_ctx = req->ctx; - - /* - * io_iopoll_complete() does not hold completion_lock to - * complete polled io, so here for polled io, we can not call - * io_req_complete() directly, otherwise there maybe concurrent - * access to cqring, defer_list, etc, which is not safe. Given - * that io_iopoll_complete() is always called under uring_lock, - * so here for polled io, we also get uring_lock to complete - * it. - */ - if (lock_ctx) - mutex_lock(&lock_ctx->uring_lock); - - req_set_fail_links(req); - io_req_complete(req, ret); - - if (lock_ctx) - mutex_unlock(&lock_ctx->uring_lock); + /* io-wq is going to take one down */ + refcount_inc(&req->refs); + io_req_task_queue_fail(req, ret); } } @@ -6561,19 +6201,10 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) static void __io_queue_sqe(struct io_kiocb *req) { struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); - const struct cred *old_creds = NULL; int ret; - if ((req->flags & REQ_F_WORK_INITIALIZED) && - (req->work.flags & IO_WQ_WORK_CREDS) && - req->work.identity->creds != current_cred()) - old_creds = override_creds(req->work.identity->creds); - ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); - if (old_creds) - revert_creds(old_creds); - /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts @@ -6607,11 +6238,11 @@ static void __io_queue_sqe(struct io_kiocb *req) io_queue_linked_timeout(linked_timeout); } -static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void io_queue_sqe(struct io_kiocb *req) { int ret; - ret = io_req_defer(req, sqe); + ret = io_req_defer(req); if (ret) { if (ret != -EIOCBQUEUED) { fail_req: @@ -6620,42 +6251,133 @@ fail_req: io_req_complete(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { - if (!req->async_data) { - ret = io_req_defer_prep(req, sqe); - if (unlikely(ret)) - goto fail_req; - } + ret = io_req_defer_prep(req); + if (unlikely(ret)) + goto fail_req; io_queue_async_work(req); } else { - if (sqe) { - ret = io_req_prep(req, sqe); - if (unlikely(ret)) - goto fail_req; - } __io_queue_sqe(req); } } -static inline void io_queue_link_head(struct io_kiocb *req) +/* + * Check SQE restrictions (opcode and flags). + * + * Returns 'true' if SQE is allowed, 'false' otherwise. + */ +static inline bool io_check_restriction(struct io_ring_ctx *ctx, + struct io_kiocb *req, + unsigned int sqe_flags) { - if (unlikely(req->flags & REQ_F_FAIL_LINK)) { - io_put_req(req); - io_req_complete(req, -ECANCELED); - } else - io_queue_sqe(req, NULL); + if (!ctx->restricted) + return true; + + if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) + return false; + + if ((sqe_flags & ctx->restrictions.sqe_flags_required) != + ctx->restrictions.sqe_flags_required) + return false; + + if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | + ctx->restrictions.sqe_flags_required)) + return false; + + return true; } -struct io_submit_link { - struct io_kiocb *head; - struct io_kiocb *last; -}; +static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_submit_state *state; + unsigned int sqe_flags; + int ret = 0; + + req->opcode = READ_ONCE(sqe->opcode); + /* same numerical values with corresponding REQ_F_*, safe to copy */ + req->flags = sqe_flags = READ_ONCE(sqe->flags); + req->user_data = READ_ONCE(sqe->user_data); + req->async_data = NULL; + req->file = NULL; + req->ctx = ctx; + req->link = NULL; + req->fixed_rsrc_refs = NULL; + /* one is dropped after submission, the other at completion */ + refcount_set(&req->refs, 2); + req->task = current; + req->result = 0; + + /* enforce forwards compatibility on users */ + if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { + req->flags = 0; + return -EINVAL; + } + + if (unlikely(req->opcode >= IORING_OP_LAST)) + return -EINVAL; + + if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) + return -EACCES; + + if ((sqe_flags & IOSQE_BUFFER_SELECT) && + !io_op_defs[req->opcode].buffer_select) + return -EOPNOTSUPP; + + req->work.list.next = NULL; + req->work.flags = 0; + req->work.personality = READ_ONCE(sqe->personality); + state = &ctx->submit_state; + + /* + * Plug now if we have more than 1 IO left after this, and the target + * is potentially a read/write to block based storage. + */ + if (!state->plug_started && state->ios_left > 1 && + io_op_defs[req->opcode].plug) { + blk_start_plug(&state->plug); + state->plug_started = true; + } + + if (io_op_defs[req->opcode].needs_file) { + bool fixed = req->flags & REQ_F_FIXED_FILE; + + req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); + if (unlikely(!req->file)) + ret = -EBADF; + } + + state->ios_left--; + return ret; +} -static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_submit_link *link) +static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; + struct io_submit_link *link = &ctx->submit_state.link; int ret; + ret = io_init_req(ctx, req, sqe); + if (unlikely(ret)) { +fail_req: + io_put_req(req); + io_req_complete(req, ret); + if (link->head) { + /* fail even hard links since we don't submit */ + link->head->flags |= REQ_F_FAIL_LINK; + io_put_req(link->head); + io_req_complete(link->head, -ECANCELED); + link->head = NULL; + } + return ret; + } + ret = io_req_prep(req, sqe); + if (unlikely(ret)) + goto fail_req; + + /* don't need @sqe from now on */ + trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, + true, ctx->flags & IORING_SETUP_SQPOLL); + /* * If we already have a head request, queue this one for async * submittal once the head completes. If we don't have a head but @@ -6677,19 +6399,16 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } - ret = io_req_defer_prep(req, sqe); - if (unlikely(ret)) { - /* fail even hard links since we don't submit */ - head->flags |= REQ_F_FAIL_LINK; - return ret; - } + ret = io_req_defer_prep(req); + if (unlikely(ret)) + goto fail_req; trace_io_uring_link(ctx, req, head); link->last->link = req; link->last = req; /* last request of a link, enqueue the link */ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - io_queue_link_head(head); + io_queue_sqe(head); link->head = NULL; } } else { @@ -6698,13 +6417,10 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ctx->drain_next = 0; } if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - ret = io_req_defer_prep(req, sqe); - if (unlikely(ret)) - req->flags |= REQ_F_FAIL_LINK; link->head = req; link->last = req; } else { - io_queue_sqe(req, sqe); + io_queue_sqe(req); } } @@ -6717,6 +6433,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, static void io_submit_state_end(struct io_submit_state *state, struct io_ring_ctx *ctx) { + if (state->link.head) + io_queue_sqe(state->link.head); if (state->comp.nr) io_submit_flush_completions(&state->comp, ctx); if (state->plug_started) @@ -6732,6 +6450,8 @@ static void io_submit_state_start(struct io_submit_state *state, { state->plug_started = false; state->ios_left = max_ios; + /* set only head, no need to init link_last in advance */ + state->link.head = NULL; } static void io_commit_sqring(struct io_ring_ctx *ctx) @@ -6777,117 +6497,9 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) return NULL; } -/* - * Check SQE restrictions (opcode and flags). - * - * Returns 'true' if SQE is allowed, 'false' otherwise. - */ -static inline bool io_check_restriction(struct io_ring_ctx *ctx, - struct io_kiocb *req, - unsigned int sqe_flags) -{ - if (!ctx->restricted) - return true; - - if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) - return false; - - if ((sqe_flags & ctx->restrictions.sqe_flags_required) != - ctx->restrictions.sqe_flags_required) - return false; - - if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | - ctx->restrictions.sqe_flags_required)) - return false; - - return true; -} - -#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ - IOSQE_BUFFER_SELECT) - -static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_submit_state *state; - unsigned int sqe_flags; - int id, ret = 0; - - req->opcode = READ_ONCE(sqe->opcode); - /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags = sqe_flags = READ_ONCE(sqe->flags); - req->user_data = READ_ONCE(sqe->user_data); - req->async_data = NULL; - req->file = NULL; - req->ctx = ctx; - req->link = NULL; - req->fixed_rsrc_refs = NULL; - /* one is dropped after submission, the other at completion */ - refcount_set(&req->refs, 2); - req->task = current; - req->result = 0; - - /* enforce forwards compatibility on users */ - if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) - return -EINVAL; - - if (unlikely(req->opcode >= IORING_OP_LAST)) - return -EINVAL; - - if (unlikely(io_sq_thread_acquire_mm_files(ctx, req))) - return -EFAULT; - - if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) - return -EACCES; - - if ((sqe_flags & IOSQE_BUFFER_SELECT) && - !io_op_defs[req->opcode].buffer_select) - return -EOPNOTSUPP; - - id = READ_ONCE(sqe->personality); - if (id) { - struct io_identity *iod; - - iod = idr_find(&ctx->personality_idr, id); - if (unlikely(!iod)) - return -EINVAL; - refcount_inc(&iod->count); - - __io_req_init_async(req); - get_cred(iod->creds); - req->work.identity = iod; - req->work.flags |= IO_WQ_WORK_CREDS; - } - - state = &ctx->submit_state; - - /* - * Plug now if we have more than 1 IO left after this, and the target - * is potentially a read/write to block based storage. - */ - if (!state->plug_started && state->ios_left > 1 && - io_op_defs[req->opcode].plug) { - blk_start_plug(&state->plug); - state->plug_started = true; - } - - if (io_op_defs[req->opcode].needs_file) { - bool fixed = req->flags & REQ_F_FIXED_FILE; - - req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); - if (unlikely(!req->file)) - ret = -EBADF; - } - - state->ios_left--; - return ret; -} - static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { - struct io_submit_link link; - int i, submitted = 0; + int submitted = 0; /* if we have a backlog and couldn't flush it all, return BUSY */ if (test_bit(0, &ctx->sq_check_overflow)) { @@ -6903,14 +6515,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) percpu_counter_add(¤t->io_uring->inflight, nr); refcount_add(nr, ¤t->usage); - io_submit_state_start(&ctx->submit_state, nr); - link.head = NULL; - for (i = 0; i < nr; i++) { + while (submitted < nr) { const struct io_uring_sqe *sqe; struct io_kiocb *req; - int err; req = io_alloc_req(ctx); if (unlikely(!req)) { @@ -6925,20 +6534,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) } /* will complete beyond this point, count as submitted */ submitted++; - - err = io_init_req(ctx, req, sqe); - if (unlikely(err)) { -fail_req: - io_put_req(req); - io_req_complete(req, err); + if (io_submit_sqe(ctx, req, sqe)) break; - } - - trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, - true, ctx->flags & IORING_SETUP_SQPOLL); - err = io_submit_sqe(req, sqe, &link); - if (err) - goto fail_req; } if (unlikely(submitted != nr)) { @@ -6950,10 +6547,8 @@ fail_req: percpu_counter_sub(&tctx->inflight, unused); put_task_struct_many(current, unused); } - if (link.head) - io_queue_link_head(link.head); - io_submit_state_end(&ctx->submit_state, ctx); + io_submit_state_end(&ctx->submit_state, ctx); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); @@ -6992,8 +6587,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (!list_empty(&ctx->iopoll_list)) io_do_iopoll(ctx, &nr_events, 0); - if (to_submit && !ctx->sqo_dead && - likely(!percpu_ref_is_dying(&ctx->refs))) + if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs))) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); } @@ -7030,71 +6624,94 @@ static void io_sqd_init_new(struct io_sq_data *sqd) io_sqd_update_thread_idle(sqd); } +static bool io_sq_thread_should_stop(struct io_sq_data *sqd) +{ + return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); +} + +static bool io_sq_thread_should_park(struct io_sq_data *sqd) +{ + return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); +} + +static void io_sq_thread_parkme(struct io_sq_data *sqd) +{ + for (;;) { + /* + * TASK_PARKED is a special state; we must serialize against + * possible pending wakeups to avoid store-store collisions on + * task->state. + * + * Such a collision might possibly result in the task state + * changin from TASK_PARKED and us failing the + * wait_task_inactive() in kthread_park(). + */ + set_special_state(TASK_PARKED); + if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) + break; + + /* + * Thread is going to call schedule(), do not preempt it, + * or the caller of kthread_park() may spend more time in + * wait_task_inactive(). + */ + preempt_disable(); + complete(&sqd->parked); + schedule_preempt_disabled(); + preempt_enable(); + } + __set_current_state(TASK_RUNNING); +} + static int io_sq_thread(void *data) { - struct cgroup_subsys_state *cur_css = NULL; - struct files_struct *old_files = current->files; - struct nsproxy *old_nsproxy = current->nsproxy; - const struct cred *old_cred = NULL; struct io_sq_data *sqd = data; struct io_ring_ctx *ctx; unsigned long timeout = 0; + char buf[TASK_COMM_LEN]; DEFINE_WAIT(wait); - task_lock(current); - current->files = NULL; - current->nsproxy = NULL; - task_unlock(current); + sprintf(buf, "iou-sqp-%d", sqd->task_pid); + set_task_comm(current, buf); + current->pf_io_worker = NULL; + + if (sqd->sq_cpu != -1) + set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); + else + set_cpus_allowed_ptr(current, cpu_online_mask); + current->flags |= PF_NO_SETAFFINITY; - while (!kthread_should_stop()) { + wait_for_completion(&sqd->startup); + + while (!io_sq_thread_should_stop(sqd)) { int ret; bool cap_entries, sqt_spin, needs_sched; /* * Any changes to the sqd lists are synchronized through the - * kthread parking. This synchronizes the thread vs users, + * thread parking. This synchronizes the thread vs users, * the users are synchronized on the sqd->ctx_lock. */ - if (kthread_should_park()) { - kthread_parkme(); - /* - * When sq thread is unparked, in case the previous park operation - * comes from io_put_sq_data(), which means that sq thread is going - * to be stopped, so here needs to have a check. - */ - if (kthread_should_stop()) - break; + if (io_sq_thread_should_park(sqd)) { + io_sq_thread_parkme(sqd); + continue; } - if (unlikely(!list_empty(&sqd->ctx_new_list))) { io_sqd_init_new(sqd); timeout = jiffies + sqd->sq_thread_idle; } - + if (fatal_signal_pending(current)) + break; sqt_spin = false; cap_entries = !list_is_singular(&sqd->ctx_list); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - if (current->cred != ctx->creds) { - if (old_cred) - revert_creds(old_cred); - old_cred = override_creds(ctx->creds); - } - io_sq_thread_associate_blkcg(ctx, &cur_css); -#ifdef CONFIG_AUDIT - current->loginuid = ctx->loginuid; - current->sessionid = ctx->sessionid; -#endif - ret = __io_sq_thread(ctx, cap_entries); if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) sqt_spin = true; - - io_sq_thread_drop_mm_files(); } if (sqt_spin || !time_after(jiffies, timeout)) { io_run_task_work(); - io_sq_thread_drop_mm_files(); cond_resched(); if (sqt_spin) timeout = jiffies + sqd->sq_thread_idle; @@ -7115,11 +6732,12 @@ static int io_sq_thread(void *data) } } - if (needs_sched && !kthread_should_park()) { + if (needs_sched && !io_sq_thread_should_park(sqd)) { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); schedule(); + try_to_freeze(); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); } @@ -7128,22 +6746,32 @@ static int io_sq_thread(void *data) timeout = jiffies + sqd->sq_thread_idle; } - io_run_task_work(); - io_sq_thread_drop_mm_files(); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_uring_cancel_sqpoll(ctx); - if (cur_css) - io_sq_thread_unassociate_blkcg(); - if (old_cred) - revert_creds(old_cred); + io_run_task_work(); - task_lock(current); - current->files = old_files; - current->nsproxy = old_nsproxy; - task_unlock(current); + /* + * Ensure that we park properly if racing with someone trying to park + * while we're exiting. If we fail to grab the lock, check park and + * park if necessary. The ordering with the park bit and the lock + * ensures that we catch this reliably. + */ + if (!mutex_trylock(&sqd->lock)) { + if (io_sq_thread_should_park(sqd)) + io_sq_thread_parkme(sqd); + mutex_lock(&sqd->lock); + } - kthread_parkme(); + sqd->thread = NULL; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + ctx->sqo_exec = 1; + io_ring_set_wakeup_flag(ctx); + } - return 0; + complete(&sqd->exited); + mutex_unlock(&sqd->lock); + do_exit(0); } struct io_wait_queue { @@ -7264,11 +6892,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); trace_io_uring_cqring_wait(ctx, min_events); do { - io_cqring_overflow_flush(ctx, false, NULL, NULL); + /* if we can't even flush overflow, don't wait for more */ + if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) { + ret = -EBUSY; + break; + } prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); finish_wait(&ctx->wait, &iowq.wq); + cond_resched(); } while (ret > 0); restore_saved_sigmask_unless(ret == -EINTR); @@ -7328,38 +6961,59 @@ static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx, percpu_ref_get(&rsrc_data->refs); } -static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, - struct io_ring_ctx *ctx, - struct fixed_rsrc_ref_node *backup_node) +static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data) { - struct fixed_rsrc_ref_node *ref_node; - int ret; + struct fixed_rsrc_ref_node *ref_node = NULL; io_rsrc_ref_lock(ctx); ref_node = data->node; + data->node = NULL; io_rsrc_ref_unlock(ctx); if (ref_node) percpu_ref_kill(&ref_node->refs); +} + +static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, + struct io_ring_ctx *ctx, + void (*rsrc_put)(struct io_ring_ctx *ctx, + struct io_rsrc_put *prsrc)) +{ + struct fixed_rsrc_ref_node *backup_node; + int ret; - percpu_ref_kill(&data->refs); + if (data->quiesce) + return -ENXIO; - /* wait for all refs nodes to complete */ - flush_delayed_work(&ctx->rsrc_put_work); + data->quiesce = true; do { + ret = -ENOMEM; + backup_node = alloc_fixed_rsrc_ref_node(ctx); + if (!backup_node) + break; + backup_node->rsrc_data = data; + backup_node->rsrc_put = rsrc_put; + + io_sqe_rsrc_kill_node(ctx, data); + percpu_ref_kill(&data->refs); + flush_delayed_work(&ctx->rsrc_put_work); + ret = wait_for_completion_interruptible(&data->done); if (!ret) break; + + percpu_ref_resurrect(&data->refs); + io_sqe_rsrc_set_node(ctx, data, backup_node); + backup_node = NULL; + reinit_completion(&data->done); + mutex_unlock(&ctx->uring_lock); ret = io_run_task_work_sig(); - if (ret < 0) { - percpu_ref_resurrect(&data->refs); - reinit_completion(&data->done); - io_sqe_rsrc_set_node(ctx, data, backup_node); - return ret; - } - } while (1); + mutex_lock(&ctx->uring_lock); + } while (ret >= 0); + data->quiesce = false; - destroy_fixed_rsrc_ref_node(backup_node); - return 0; + if (backup_node) + destroy_fixed_rsrc_ref_node(backup_node); + return ret; } static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx) @@ -7390,18 +7044,17 @@ static void free_fixed_rsrc_data(struct fixed_rsrc_data *data) static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { struct fixed_rsrc_data *data = ctx->file_data; - struct fixed_rsrc_ref_node *backup_node; unsigned nr_tables, i; int ret; - if (!data) + /* + * percpu_ref_is_dying() is to stop parallel files unregister + * Since we possibly drop uring lock later in this function to + * run task work. + */ + if (!data || percpu_ref_is_dying(&data->refs)) return -ENXIO; - backup_node = alloc_fixed_rsrc_ref_node(ctx); - if (!backup_node) - return -ENOMEM; - init_fixed_file_ref_node(ctx, backup_node); - - ret = io_rsrc_ref_quiesce(data, ctx, backup_node); + ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put); if (ret) return ret; @@ -7415,20 +7068,76 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return 0; } +static void io_sq_thread_unpark(struct io_sq_data *sqd) + __releases(&sqd->lock) +{ + if (sqd->thread == current) + return; + clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + if (sqd->thread) + wake_up_state(sqd->thread, TASK_PARKED); + mutex_unlock(&sqd->lock); +} + +static void io_sq_thread_park(struct io_sq_data *sqd) + __acquires(&sqd->lock) +{ + if (sqd->thread == current) + return; + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + mutex_lock(&sqd->lock); + if (sqd->thread) { + wake_up_process(sqd->thread); + wait_for_completion(&sqd->parked); + } +} + +static void io_sq_thread_stop(struct io_sq_data *sqd) +{ + if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) + return; + mutex_lock(&sqd->lock); + if (sqd->thread) { + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)); + wake_up_process(sqd->thread); + mutex_unlock(&sqd->lock); + wait_for_completion(&sqd->exited); + WARN_ON_ONCE(sqd->thread); + } else { + mutex_unlock(&sqd->lock); + } +} + static void io_put_sq_data(struct io_sq_data *sqd) { if (refcount_dec_and_test(&sqd->refs)) { - /* - * The park is a bit of a work-around, without it we get - * warning spews on shutdown with SQPOLL set and affinity - * set to a single CPU. - */ + io_sq_thread_stop(sqd); + kfree(sqd); + } +} + +static void io_sq_thread_finish(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + + if (sqd) { + complete(&sqd->startup); if (sqd->thread) { - kthread_park(sqd->thread); - kthread_stop(sqd->thread); + wait_for_completion(&ctx->sq_thread_comp); + io_sq_thread_park(sqd); } - kfree(sqd); + mutex_lock(&sqd->ctx_lock); + list_del(&ctx->sqd_list); + io_sqd_update_thread_idle(sqd); + mutex_unlock(&sqd->ctx_lock); + + if (sqd->thread) + io_sq_thread_unpark(sqd); + + io_put_sq_data(sqd); + ctx->sq_data = NULL; } } @@ -7475,68 +7184,12 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) mutex_init(&sqd->ctx_lock); mutex_init(&sqd->lock); init_waitqueue_head(&sqd->wait); + init_completion(&sqd->startup); + init_completion(&sqd->parked); + init_completion(&sqd->exited); return sqd; } -static void io_sq_thread_unpark(struct io_sq_data *sqd) - __releases(&sqd->lock) -{ - if (!sqd->thread) - return; - kthread_unpark(sqd->thread); - mutex_unlock(&sqd->lock); -} - -static void io_sq_thread_park(struct io_sq_data *sqd) - __acquires(&sqd->lock) -{ - if (!sqd->thread) - return; - mutex_lock(&sqd->lock); - kthread_park(sqd->thread); -} - -static void io_sq_thread_stop(struct io_ring_ctx *ctx) -{ - struct io_sq_data *sqd = ctx->sq_data; - - if (sqd) { - if (sqd->thread) { - /* - * We may arrive here from the error branch in - * io_sq_offload_create() where the kthread is created - * without being waked up, thus wake it up now to make - * sure the wait will complete. - */ - wake_up_process(sqd->thread); - wait_for_completion(&ctx->sq_thread_comp); - - io_sq_thread_park(sqd); - } - - mutex_lock(&sqd->ctx_lock); - list_del(&ctx->sqd_list); - io_sqd_update_thread_idle(sqd); - mutex_unlock(&sqd->ctx_lock); - - if (sqd->thread) - io_sq_thread_unpark(sqd); - - io_put_sq_data(sqd); - ctx->sq_data = NULL; - } -} - -static void io_finish_async(struct io_ring_ctx *ctx) -{ - io_sq_thread_stop(ctx); - - if (ctx->io_wq) { - io_wq_destroy(ctx->io_wq); - ctx->io_wq = NULL; - } -} - #if defined(CONFIG_UNIX) /* * Ensure the UNIX gc is aware of our file set, so we are certain that @@ -7563,7 +7216,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) skb->sk = sk; nr_files = 0; - fpl->user = get_uid(ctx->user); + fpl->user = get_uid(current_user()); for (i = 0; i < nr; i++) { struct file *file = io_file_from_index(ctx, i + offset); @@ -8095,54 +7748,34 @@ static struct io_wq_work *io_free_work(struct io_wq_work *work) return req ? &req->work : NULL; } -static int io_init_wq_offload(struct io_ring_ctx *ctx, - struct io_uring_params *p) +static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx) { + struct io_wq_hash *hash; struct io_wq_data data; - struct fd f; - struct io_ring_ctx *ctx_attach; unsigned int concurrency; - int ret = 0; - - data.user = ctx->user; - data.free_work = io_free_work; - data.do_work = io_wq_submit_work; - if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - - ctx->io_wq = io_wq_create(concurrency, &data); - if (IS_ERR(ctx->io_wq)) { - ret = PTR_ERR(ctx->io_wq); - ctx->io_wq = NULL; - } - return ret; + hash = ctx->hash_map; + if (!hash) { + hash = kzalloc(sizeof(*hash), GFP_KERNEL); + if (!hash) + return ERR_PTR(-ENOMEM); + refcount_set(&hash->refs, 1); + init_waitqueue_head(&hash->wait); + ctx->hash_map = hash; } - f = fdget(p->wq_fd); - if (!f.file) - return -EBADF; - - if (f.file->f_op != &io_uring_fops) { - ret = -EINVAL; - goto out_fput; - } + data.hash = hash; + data.free_work = io_free_work; + data.do_work = io_wq_submit_work; - ctx_attach = f.file->private_data; - /* @io_wq is protected by holding the fd */ - if (!io_wq_get(ctx_attach->io_wq, &data)) { - ret = -EINVAL; - goto out_fput; - } + /* Do QD, or 4 * CPUS, whatever is smallest */ + concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = ctx_attach->io_wq; -out_fput: - fdput(f); - return ret; + return io_wq_create(concurrency, &data); } -static int io_uring_alloc_task_context(struct task_struct *task) +static int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx) { struct io_uring_task *tctx; int ret; @@ -8157,13 +7790,19 @@ static int io_uring_alloc_task_context(struct task_struct *task) return ret; } + tctx->io_wq = io_init_wq_offload(ctx); + if (IS_ERR(tctx->io_wq)) { + ret = PTR_ERR(tctx->io_wq); + percpu_counter_destroy(&tctx->inflight); + kfree(tctx); + return ret; + } + xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); tctx->last = NULL; atomic_set(&tctx->in_idle, 0); tctx->sqpoll = false; - io_init_identity(&tctx->__identity); - tctx->identity = &tctx->__identity; task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); @@ -8177,20 +7816,54 @@ void __io_uring_free(struct task_struct *tsk) struct io_uring_task *tctx = tsk->io_uring; WARN_ON_ONCE(!xa_empty(&tctx->xa)); - WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1); - if (tctx->identity != &tctx->__identity) - kfree(tctx->identity); + WARN_ON_ONCE(tctx->io_wq); + percpu_counter_destroy(&tctx->inflight); kfree(tctx); tsk->io_uring = NULL; } +static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx) +{ + struct task_struct *tsk; + int ret; + + clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + reinit_completion(&sqd->parked); + ctx->sqo_exec = 0; + sqd->task_pid = current->pid; + tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); + if (IS_ERR(tsk)) + return PTR_ERR(tsk); + ret = io_uring_alloc_task_context(tsk, ctx); + if (ret) + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + sqd->thread = tsk; + wake_up_new_task(tsk); + return ret; +} + static int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { int ret; + /* Retain compatibility with failing for an invalid attach attempt */ + if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == + IORING_SETUP_ATTACH_WQ) { + struct fd f; + + f = fdget(p->wq_fd); + if (!f.file) + return -ENXIO; + if (f.file->f_op != &io_uring_fops) { + fdput(f); + return -EINVAL; + } + fdput(f); + } if (ctx->flags & IORING_SETUP_SQPOLL) { + struct task_struct *tsk; struct io_sq_data *sqd; ret = -EPERM; @@ -8215,7 +7888,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, ctx->sq_thread_idle = HZ; if (sqd->thread) - goto done; + return 0; if (p->flags & IORING_SETUP_SQ_AFF) { int cpu = p->sq_thread_cpu; @@ -8226,18 +7899,22 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, if (!cpu_online(cpu)) goto err; - sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd, - cpu, "io_uring-sq"); + sqd->sq_cpu = cpu; } else { - sqd->thread = kthread_create(io_sq_thread, sqd, - "io_uring-sq"); + sqd->sq_cpu = -1; } - if (IS_ERR(sqd->thread)) { - ret = PTR_ERR(sqd->thread); - sqd->thread = NULL; + + sqd->task_pid = current->pid; + tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); goto err; } - ret = io_uring_alloc_task_context(sqd->thread); + ret = io_uring_alloc_task_context(tsk, ctx); + if (ret) + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + sqd->thread = tsk; + wake_up_new_task(tsk); if (ret) goto err; } else if (p->flags & IORING_SETUP_SQ_AFF) { @@ -8246,14 +7923,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, goto err; } -done: - ret = io_init_wq_offload(ctx, p); - if (ret) - goto err; - return 0; err: - io_finish_async(ctx); + io_sq_thread_finish(ctx); return ret; } @@ -8261,8 +7933,9 @@ static void io_sq_offload_start(struct io_ring_ctx *ctx) { struct io_sq_data *sqd = ctx->sq_data; - if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread) - wake_up_process(sqd->thread); + ctx->flags &= ~IORING_SETUP_R_DISABLED; + if (ctx->flags & IORING_SETUP_SQPOLL) + complete(&sqd->startup); } static inline void __io_unaccount_mem(struct user_struct *user, @@ -8292,7 +7965,7 @@ static inline int __io_account_mem(struct user_struct *user, static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { - if (ctx->limit_mem) + if (ctx->user) __io_unaccount_mem(ctx->user, nr_pages); if (ctx->mm_account) @@ -8303,7 +7976,7 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { int ret; - if (ctx->limit_mem) { + if (ctx->user) { ret = __io_account_mem(ctx->user, nr_pages); if (ret) return ret; @@ -8699,22 +8372,26 @@ static void io_req_cache_free(struct list_head *list, struct task_struct *tsk) } } -static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk) +static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_submit_state *submit_state = &ctx->submit_state; + struct io_comp_state *cs = &ctx->submit_state.comp; mutex_lock(&ctx->uring_lock); - if (submit_state->free_reqs) + if (submit_state->free_reqs) { kmem_cache_free_bulk(req_cachep, submit_state->free_reqs, submit_state->reqs); - - io_req_cache_free(&submit_state->comp.free_list, NULL); + submit_state->free_reqs = 0; + } spin_lock_irq(&ctx->completion_lock); - io_req_cache_free(&submit_state->comp.locked_free_list, NULL); + list_splice_init(&cs->locked_free_list, &cs->free_list); + cs->locked_free_nr = 0; spin_unlock_irq(&ctx->completion_lock); + io_req_cache_free(&cs->free_list, NULL); + mutex_unlock(&ctx->uring_lock); } @@ -8728,22 +8405,17 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); mutex_unlock(&ctx->uring_lock); - io_finish_async(ctx); + io_sq_thread_finish(ctx); io_sqe_buffers_unregister(ctx); - if (ctx->sqo_task) { - put_task_struct(ctx->sqo_task); - ctx->sqo_task = NULL; + if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; } -#ifdef CONFIG_BLK_CGROUP - if (ctx->sqo_blkcg_css) - css_put(ctx->sqo_blkcg_css); -#endif - + mutex_lock(&ctx->uring_lock); io_sqe_files_unregister(ctx); + mutex_unlock(&ctx->uring_lock); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); idr_destroy(&ctx->personality_idr); @@ -8760,8 +8432,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) percpu_ref_exit(&ctx->refs); free_uid(ctx->user); - put_cred(ctx->creds); - io_req_caches_free(ctx, NULL); + io_req_caches_free(ctx); + if (ctx->hash_map) + io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_hash); kfree(ctx); } @@ -8808,13 +8481,11 @@ static int io_uring_fasync(int fd, struct file *file, int on) static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { - struct io_identity *iod; + const struct cred *creds; - iod = idr_remove(&ctx->personality_idr, id); - if (iod) { - put_cred(iod->creds); - if (refcount_dec_and_test(&iod->count)) - kfree(iod); + creds = idr_remove(&ctx->personality_idr, id); + if (creds) { + put_cred(creds); return 0; } @@ -8829,6 +8500,28 @@ static int io_remove_personalities(int id, void *p, void *data) return 0; } +static bool io_run_ctx_fallback(struct io_ring_ctx *ctx) +{ + struct callback_head *work, *next; + bool executed = false; + + do { + work = xchg(&ctx->exit_task_work, NULL); + if (!work) + break; + + do { + next = work->next; + work->func(work); + work = next; + cond_resched(); + } while (work); + executed = true; + } while (1); + + return executed; +} + static void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, @@ -8846,21 +8539,10 @@ static void io_ring_exit_work(struct work_struct *work) io_ring_ctx_free(ctx); } -static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - return req->ctx == data; -} - static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); - - if (WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) && !ctx->sqo_dead)) - ctx->sqo_dead = 1; - /* if force is set, the ring is going away. always drop after that */ ctx->cq_overflow_flushed = 1; if (ctx->rings) @@ -8871,9 +8553,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_kill_timeouts(ctx, NULL, NULL); io_poll_remove_all(ctx, NULL, NULL); - if (ctx->io_wq) - io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true); - /* if we failed setting up the ctx, we might not have any rings */ io_iopoll_try_reap_events(ctx); @@ -8952,13 +8631,15 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct files_struct *files) { struct io_task_cancel cancel = { .task = task, .files = files, }; + struct task_struct *tctx_task = task ?: current; + struct io_uring_task *tctx = tctx_task->io_uring; while (1) { enum io_wq_cancel cret; bool ret = false; - if (ctx->io_wq) { - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, + if (tctx && tctx->io_wq) { + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, &cancel, true); ret |= (cret != IO_WQ_CANCEL_NOTFOUND); } @@ -8974,6 +8655,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ret |= io_poll_remove_all(ctx, task, files); ret |= io_kill_timeouts(ctx, task, files); ret |= io_run_task_work(); + ret |= io_run_ctx_fallback(ctx); io_cqring_overflow_flush(ctx, true, task, files); if (!ret) break; @@ -9021,17 +8703,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } } -static void io_disable_sqo_submit(struct io_ring_ctx *ctx) -{ - mutex_lock(&ctx->uring_lock); - ctx->sqo_dead = 1; - mutex_unlock(&ctx->uring_lock); - - /* make sure callers enter the ring to get error */ - if (ctx->rings) - io_ring_set_wakeup_flag(ctx); -} - /* * We need to iteratively cancel requests, in case a request has dependent * hard links. These persist even for failure of cancelations, hence keep @@ -9043,10 +8714,15 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, struct task_struct *task = current; if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { - io_disable_sqo_submit(ctx); - task = ctx->sq_data->thread; - atomic_inc(&task->io_uring->in_idle); + /* never started, nothing to cancel */ + if (ctx->flags & IORING_SETUP_R_DISABLED) { + io_sq_offload_start(ctx); + return; + } io_sq_thread_park(ctx->sq_data); + task = ctx->sq_data->thread; + if (task) + atomic_inc(&task->io_uring->in_idle); } io_cancel_defer_files(ctx, task, files); @@ -9055,10 +8731,10 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, if (!files) io_uring_try_cancel_requests(ctx, task, NULL); - if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { + if (task) atomic_dec(&task->io_uring->in_idle); + if (ctx->sq_data) io_sq_thread_unpark(ctx->sq_data); - } } /* @@ -9070,7 +8746,7 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) int ret; if (unlikely(!tctx)) { - ret = io_uring_alloc_task_context(current); + ret = io_uring_alloc_task_context(current, ctx); if (unlikely(ret)) return ret; tctx = current->io_uring; @@ -9086,10 +8762,6 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) fput(file); return ret; } - - /* one and only SQPOLL file note, held by sqo_task */ - WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) && - current != ctx->sqo_task); } tctx->last = file; } @@ -9119,13 +8791,17 @@ static void io_uring_del_task_file(struct file *file) fput(file); } -static void io_uring_remove_task_files(struct io_uring_task *tctx) +static void io_uring_clean_tctx(struct io_uring_task *tctx) { struct file *file; unsigned long index; xa_for_each(&tctx->xa, index, file) io_uring_del_task_file(file); + if (tctx->io_wq) { + io_wq_put_and_exit(tctx->io_wq); + tctx->io_wq = NULL; + } } void __io_uring_files_cancel(struct files_struct *files) @@ -9141,7 +8817,7 @@ void __io_uring_files_cancel(struct files_struct *files) atomic_dec(&tctx->in_idle); if (files) - io_uring_remove_task_files(tctx); + io_uring_clean_tctx(tctx); } static s64 tctx_inflight(struct io_uring_task *tctx) @@ -9151,15 +8827,19 @@ static s64 tctx_inflight(struct io_uring_task *tctx) static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) { + struct io_sq_data *sqd = ctx->sq_data; struct io_uring_task *tctx; s64 inflight; DEFINE_WAIT(wait); - if (!ctx->sq_data) + if (!sqd) + return; + io_sq_thread_park(sqd); + if (!sqd->thread || !sqd->thread->io_uring) { + io_sq_thread_unpark(sqd); return; + } tctx = ctx->sq_data->thread->io_uring; - io_disable_sqo_submit(ctx); - atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ @@ -9179,6 +8859,7 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) finish_wait(&tctx->wait, &wait); } while (1); atomic_dec(&tctx->in_idle); + io_sq_thread_unpark(sqd); } /* @@ -9194,7 +8875,6 @@ void __io_uring_task_cancel(void) /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); - /* trigger io_disable_sqo_submit() */ if (tctx->sqpoll) { struct file *file; unsigned long index; @@ -9224,47 +8904,9 @@ void __io_uring_task_cancel(void) atomic_dec(&tctx->in_idle); - io_uring_remove_task_files(tctx); -} - -static int io_uring_flush(struct file *file, void *data) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_ring_ctx *ctx = file->private_data; - - if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { - io_uring_cancel_task_requests(ctx, NULL); - io_req_caches_free(ctx, current); - } - - if (!tctx) - return 0; - - /* we should have cancelled and erased it before PF_EXITING */ - WARN_ON_ONCE((current->flags & PF_EXITING) && - xa_load(&tctx->xa, (unsigned long)file)); - - /* - * fput() is pending, will be 2 if the only other ref is our potential - * task file note. If the task is exiting, drop regardless of count. - */ - if (atomic_long_read(&file->f_count) != 2) - return 0; - - if (ctx->flags & IORING_SETUP_SQPOLL) { - /* there is only one file note, which is owned by sqo_task */ - WARN_ON_ONCE(ctx->sqo_task != current && - xa_load(&tctx->xa, (unsigned long)file)); - /* sqo_dead check is for when this happens after cancellation */ - WARN_ON_ONCE(ctx->sqo_task == current && !ctx->sqo_dead && - !xa_load(&tctx->xa, (unsigned long)file)); - - io_disable_sqo_submit(ctx); - } - - if (!(ctx->flags & IORING_SETUP_SQPOLL) || ctx->sqo_task == current) - io_uring_del_task_file(file); - return 0; + io_uring_clean_tctx(tctx); + /* all current's requests should be gone, we can kill tctx */ + __io_uring_free(current); } static void *io_uring_validate_mmap_request(struct file *file, @@ -9345,22 +8987,14 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) do { if (!io_sqring_full(ctx)) break; - prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); - if (unlikely(ctx->sqo_dead)) { - ret = -EOWNERDEAD; - goto out; - } - if (!io_sqring_full(ctx)) break; - schedule(); } while (!signal_pending(current)); finish_wait(&ctx->sqo_sq_wait, &wait); -out: return ret; } @@ -9435,9 +9069,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (ctx->flags & IORING_SETUP_SQPOLL) { io_cqring_overflow_flush(ctx, false, NULL, NULL); + if (unlikely(ctx->sqo_exec)) { + ret = io_sq_thread_fork(ctx->sq_data, ctx); + if (ret) + goto out; + ctx->sqo_exec = 0; + } ret = -EOWNERDEAD; - if (unlikely(ctx->sqo_dead)) - goto out; if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) { @@ -9491,8 +9129,7 @@ out_fput: #ifdef CONFIG_PROC_FS static int io_uring_show_cred(int id, void *p, void *data) { - struct io_identity *iod = p; - const struct cred *cred = iod->creds; + const struct cred *cred = p; struct seq_file *m = data; struct user_namespace *uns = seq_user_ns(m); struct group_info *gi; @@ -9537,8 +9174,11 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) */ has_lock = mutex_trylock(&ctx->uring_lock); - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) + if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { sq = ctx->sq_data; + if (!sq->thread) + sq = NULL; + } seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); @@ -9590,7 +9230,6 @@ static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) static const struct file_operations io_uring_fops = { .release = io_uring_release, - .flush = io_uring_flush, .mmap = io_uring_mmap, #ifndef CONFIG_MMU .get_unmapped_area = io_uring_nommu_get_unmapped_area, @@ -9698,7 +9337,6 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) static int io_uring_create(unsigned entries, struct io_uring_params *p, struct io_uring_params __user *params) { - struct user_struct *user = NULL; struct io_ring_ctx *ctx; struct file *file; int ret; @@ -9740,22 +9378,12 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_entries = 2 * p->sq_entries; } - user = get_uid(current_user()); - ctx = io_ring_ctx_alloc(p); - if (!ctx) { - free_uid(user); + if (!ctx) return -ENOMEM; - } ctx->compat = in_compat_syscall(); - ctx->limit_mem = !capable(CAP_IPC_LOCK); - ctx->user = user; - ctx->creds = get_current_cred(); -#ifdef CONFIG_AUDIT - ctx->loginuid = current->loginuid; - ctx->sessionid = current->sessionid; -#endif - ctx->sqo_task = get_task_struct(current); + if (!capable(CAP_IPC_LOCK)) + ctx->user = get_uid(current_user()); /* * This is just grabbed for accounting purposes. When a process exits, @@ -9766,24 +9394,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, mmgrab(current->mm); ctx->mm_account = current->mm; -#ifdef CONFIG_BLK_CGROUP - /* - * The sq thread will belong to the original cgroup it was inited in. - * If the cgroup goes offline (e.g. disabling the io controller), then - * issued bios will be associated with the closest cgroup later in the - * block layer. - */ - rcu_read_lock(); - ctx->sqo_blkcg_css = blkcg_css(); - ret = css_tryget_online(ctx->sqo_blkcg_css); - rcu_read_unlock(); - if (!ret) { - /* don't init against a dying cgroup, have the user try again */ - ctx->sqo_blkcg_css = NULL; - ret = -ENODEV; - goto err; - } -#endif ret = io_allocate_scq_urings(ctx, p); if (ret) goto err; @@ -9817,7 +9427,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | - IORING_FEAT_EXT_ARG; + IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -9836,7 +9446,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, */ ret = io_uring_install_fd(ctx, file); if (ret < 0) { - io_disable_sqo_submit(ctx); /* fput will clean it up */ fput(file); return ret; @@ -9845,7 +9454,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: - io_disable_sqo_submit(ctx); io_ring_ctx_wait_and_kill(ctx); return ret; } @@ -9923,21 +9531,15 @@ out: static int io_register_personality(struct io_ring_ctx *ctx) { - struct io_identity *id; + const struct cred *creds; int ret; - id = kmalloc(sizeof(*id), GFP_KERNEL); - if (unlikely(!id)) - return -ENOMEM; - - io_init_identity(id); - id->creds = get_current_cred(); + creds = get_current_cred(); - ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL); - if (ret < 0) { - put_cred(id->creds); - kfree(id); - } + ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, + USHRT_MAX, GFP_KERNEL); + if (ret < 0) + put_cred(creds); return ret; } @@ -10019,10 +9621,7 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) if (ctx->restrictions.registered) ctx->restricted = 1; - ctx->flags &= ~IORING_SETUP_R_DISABLED; - io_sq_offload_start(ctx); - return 0; } @@ -10196,6 +9795,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, ctx = f.file->private_data; + io_run_task_work(); + mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); mutex_unlock(&ctx->uring_lock); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 16a1e82e3aeb..7ffcd7ef33d4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -278,14 +278,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (!is_contig || bio_full(ctx->bio, plen)) { gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; - int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); if (ctx->bio) submit_bio(ctx->bio); if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; - ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); + ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs)); /* * If the bio_alloc fails, try it again for a single page to * avoid having to deal with partial page reads. This emulates @@ -1459,13 +1459,6 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) goto redirty; /* - * Given that we do not allow direct reclaim to call us, we should - * never be called in a recursive filesystem reclaim context. - */ - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) - goto redirty; - - /* * Is this page beyond the end of the file? * * The page index is less than the end_index, adjust the end_offset diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index 107ee80c3568..dab1b02eba5b 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -10,122 +10,17 @@ #include <linux/pagemap.h> #include <linux/pagevec.h> -/* - * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. - * Returns true if found and updates @lastoff to the offset in file. - */ -static bool -page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff, - int whence) -{ - const struct address_space_operations *ops = inode->i_mapping->a_ops; - unsigned int bsize = i_blocksize(inode), off; - bool seek_data = whence == SEEK_DATA; - loff_t poff = page_offset(page); - - if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE)) - return false; - - if (*lastoff < poff) { - /* - * Last offset smaller than the start of the page means we found - * a hole: - */ - if (whence == SEEK_HOLE) - return true; - *lastoff = poff; - } - - /* - * Just check the page unless we can and should check block ranges: - */ - if (bsize == PAGE_SIZE || !ops->is_partially_uptodate) - return PageUptodate(page) == seek_data; - - lock_page(page); - if (unlikely(page->mapping != inode->i_mapping)) - goto out_unlock_not_found; - - for (off = 0; off < PAGE_SIZE; off += bsize) { - if (offset_in_page(*lastoff) >= off + bsize) - continue; - if (ops->is_partially_uptodate(page, off, bsize) == seek_data) { - unlock_page(page); - return true; - } - *lastoff = poff + off + bsize; - } - -out_unlock_not_found: - unlock_page(page); - return false; -} - -/* - * Seek for SEEK_DATA / SEEK_HOLE in the page cache. - * - * Within unwritten extents, the page cache determines which parts are holes - * and which are data: uptodate buffer heads count as data; everything else - * counts as a hole. - * - * Returns the resulting offset on successs, and -ENOENT otherwise. - */ static loff_t -page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, - int whence) -{ - pgoff_t index = offset >> PAGE_SHIFT; - pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); - loff_t lastoff = offset; - struct pagevec pvec; - - if (length <= 0) - return -ENOENT; - - pagevec_init(&pvec); - - do { - unsigned nr_pages, i; - - nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, - end - 1); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - if (page_seek_hole_data(inode, page, &lastoff, whence)) - goto check_range; - lastoff = page_offset(page) + PAGE_SIZE; - } - pagevec_release(&pvec); - } while (index < end); - - /* When no page at lastoff and we are not done, we found a hole. */ - if (whence != SEEK_HOLE) - goto not_found; - -check_range: - if (lastoff < offset + length) - goto out; -not_found: - lastoff = -ENOENT; -out: - pagevec_release(&pvec); - return lastoff; -} - - -static loff_t -iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, +iomap_seek_hole_actor(struct inode *inode, loff_t start, loff_t length, void *data, struct iomap *iomap, struct iomap *srcmap) { + loff_t offset = start; + switch (iomap->type) { case IOMAP_UNWRITTEN: - offset = page_cache_seek_hole_data(inode, offset, length, - SEEK_HOLE); - if (offset < 0) + offset = mapping_seek_hole_data(inode->i_mapping, start, + start + length, SEEK_HOLE); + if (offset == start + length) return length; fallthrough; case IOMAP_HOLE: @@ -164,15 +59,17 @@ iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops) EXPORT_SYMBOL_GPL(iomap_seek_hole); static loff_t -iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, +iomap_seek_data_actor(struct inode *inode, loff_t start, loff_t length, void *data, struct iomap *iomap, struct iomap *srcmap) { + loff_t offset = start; + switch (iomap->type) { case IOMAP_HOLE: return length; case IOMAP_UNWRITTEN: - offset = page_cache_seek_hole_data(inode, offset, length, - SEEK_DATA); + offset = mapping_seek_hole_data(inode->i_mapping, start, + start + length, SEEK_DATA); if (offset < 0) return length; fallthrough; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 093ffbd82395..55a79df70d24 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -226,7 +226,8 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a return rc; } -int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int rc, xprefix; @@ -236,7 +237,8 @@ int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (acl) { umode_t mode; - rc = posix_acl_update_mode(inode, &mode, &acl); + rc = posix_acl_update_mode(&init_user_ns, inode, &mode, + &acl); if (rc) return rc; if (inode->i_mode != mode) { diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 12d0271bdde3..62c50da9d493 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -28,7 +28,8 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL struct posix_acl *jffs2_get_acl(struct inode *inode, int type); -int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 776493713153..c0aabbcbfd58 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -24,18 +24,21 @@ static int jffs2_readdir (struct file *, struct dir_context *); -static int jffs2_create (struct inode *,struct dentry *,umode_t, - bool); +static int jffs2_create (struct user_namespace *, struct inode *, + struct dentry *, umode_t, bool); static struct dentry *jffs2_lookup (struct inode *,struct dentry *, unsigned int); static int jffs2_link (struct dentry *,struct inode *,struct dentry *); static int jffs2_unlink (struct inode *,struct dentry *); -static int jffs2_symlink (struct inode *,struct dentry *,const char *); -static int jffs2_mkdir (struct inode *,struct dentry *,umode_t); +static int jffs2_symlink (struct user_namespace *, struct inode *, + struct dentry *, const char *); +static int jffs2_mkdir (struct user_namespace *, struct inode *,struct dentry *, + umode_t); static int jffs2_rmdir (struct inode *,struct dentry *); -static int jffs2_mknod (struct inode *,struct dentry *,umode_t,dev_t); -static int jffs2_rename (struct inode *, struct dentry *, - struct inode *, struct dentry *, +static int jffs2_mknod (struct user_namespace *, struct inode *,struct dentry *, + umode_t,dev_t); +static int jffs2_rename (struct user_namespace *, struct inode *, + struct dentry *, struct inode *, struct dentry *, unsigned int); const struct file_operations jffs2_dir_operations = @@ -157,8 +160,8 @@ static int jffs2_readdir(struct file *file, struct dir_context *ctx) /***********************************************************************/ -static int jffs2_create(struct inode *dir_i, struct dentry *dentry, - umode_t mode, bool excl) +static int jffs2_create(struct user_namespace *mnt_userns, struct inode *dir_i, + struct dentry *dentry, umode_t mode, bool excl) { struct jffs2_raw_inode *ri; struct jffs2_inode_info *f, *dir_f; @@ -276,7 +279,8 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de /***********************************************************************/ -static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char *target) +static int jffs2_symlink (struct user_namespace *mnt_userns, struct inode *dir_i, + struct dentry *dentry, const char *target) { struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; @@ -438,7 +442,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char } -static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode) +static int jffs2_mkdir (struct user_namespace *mnt_userns, struct inode *dir_i, + struct dentry *dentry, umode_t mode) { struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; @@ -609,7 +614,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) return ret; } -static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode, dev_t rdev) +static int jffs2_mknod (struct user_namespace *mnt_userns, struct inode *dir_i, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; @@ -756,7 +762,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode return ret; } -static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, +static int jffs2_rename (struct user_namespace *mnt_userns, + struct inode *old_dir_i, struct dentry *old_dentry, struct inode *new_dir_i, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 78858f6e9583..2ac410477c4f 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -190,18 +190,19 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) return 0; } -int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) +int jffs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct inode *inode = d_inode(dentry); int rc; - rc = setattr_prepare(dentry, iattr); + rc = setattr_prepare(&init_user_ns, dentry, iattr); if (rc) return rc; rc = jffs2_do_setattr(inode, iattr); if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(inode, inode->i_mode); + rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); return rc; } diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index ef1cfa61549e..173eccac691d 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -164,7 +164,7 @@ long jffs2_ioctl(struct file *, unsigned int, unsigned long); extern const struct inode_operations jffs2_symlink_inode_operations; /* fs.c */ -int jffs2_setattr (struct dentry *, struct iattr *); +int jffs2_setattr (struct user_namespace *, struct dentry *, struct iattr *); int jffs2_do_setattr (struct inode *, struct iattr *); struct inode *jffs2_iget(struct super_block *, unsigned long); void jffs2_evict_inode (struct inode *); diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index c2332e30f218..aef5522551db 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -57,6 +57,7 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler, } static int jffs2_security_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 5d6030826c52..cc3f24883e7d 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -25,6 +25,7 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler, } static int jffs2_trusted_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index 9d027b4abcf9..fb945977c013 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -25,6 +25,7 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler, } static int jffs2_user_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 92cc0ac2d1fc..43c285c3d2a7 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -91,7 +91,8 @@ out: return rc; } -int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int rc; tid_t tid; @@ -101,7 +102,7 @@ int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) tid = txBegin(inode->i_sb, 0); mutex_lock(&JFS_IP(inode)->commit_mutex); if (type == ACL_TYPE_ACCESS && acl) { - rc = posix_acl_update_mode(inode, &mode, &acl); + rc = posix_acl_update_mode(&init_user_ns, inode, &mode, &acl); if (rc) goto end_tx; if (mode != inode->i_mode) diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 930d2701f206..28b70e7c7dd4 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -85,12 +85,13 @@ static int jfs_release(struct inode *inode, struct file *file) return 0; } -int jfs_setattr(struct dentry *dentry, struct iattr *iattr) +int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct inode *inode = d_inode(dentry); int rc; - rc = setattr_prepare(dentry, iattr); + rc = setattr_prepare(&init_user_ns, dentry, iattr); if (rc) return rc; @@ -118,11 +119,11 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) jfs_truncate(inode); } - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) - rc = posix_acl_chmod(inode, inode->i_mode); + rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); return rc; } diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 10ee0ecca1a8..2581d4db58ff 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -76,7 +76,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) return err; - if (!inode_owner_or_capable(inode)) { + if (!inode_owner_or_capable(&init_user_ns, inode)) { err = -EACCES; goto setflags_out; } diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 9f8f92dd6f84..7ae389a7a366 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -8,7 +8,8 @@ #ifdef CONFIG_JFS_POSIX_ACL struct posix_acl *jfs_get_acl(struct inode *inode, int type); -int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); #else diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 4cef170630db..59379089e939 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -64,7 +64,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) goto fail_put; } - inode_init_owner(inode, parent, mode); + inode_init_owner(&init_user_ns, inode, parent, mode); /* * New inodes need to save sane values on disk when * uid & gid mount options are used diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 70a0d12e427e..01daa0cb0ae5 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -26,7 +26,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); extern void jfs_set_inode_flags(struct inode *); extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern int jfs_setattr(struct dentry *, struct iattr *); +extern int jfs_setattr(struct user_namespace *, struct dentry *, struct iattr *); extern const struct address_space_operations jfs_aops; extern const struct inode_operations jfs_dir_inode_operations; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 7a55d14cc1af..9abed0d750e5 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -59,8 +59,8 @@ static inline void free_ea_wmap(struct inode *inode) * RETURN: Errors from subroutines * */ -static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, - bool excl) +static int jfs_create(struct user_namespace *mnt_userns, struct inode *dip, + struct dentry *dentry, umode_t mode, bool excl) { int rc = 0; tid_t tid; /* transaction id */ @@ -192,7 +192,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, * note: * EACCES: user needs search+write permission on the parent directory */ -static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) +static int jfs_mkdir(struct user_namespace *mnt_userns, struct inode *dip, + struct dentry *dentry, umode_t mode) { int rc = 0; tid_t tid; /* transaction id */ @@ -868,8 +869,8 @@ static int jfs_link(struct dentry *old_dentry, * an intermediate result whose length exceeds PATH_MAX [XPG4.2] */ -static int jfs_symlink(struct inode *dip, struct dentry *dentry, - const char *name) +static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip, + struct dentry *dentry, const char *name) { int rc; tid_t tid; @@ -1058,9 +1059,9 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, * * FUNCTION: rename a file or directory */ -static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int jfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct btstack btstack; ino_t ino; @@ -1344,8 +1345,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, * * FUNCTION: Create a special file (device) */ -static int jfs_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int jfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct jfs_inode_info *jfs_ip; struct btstack btstack; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index b2dc4d1f9dcc..1f0ffabbde56 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -551,7 +551,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) ret = -ENOMEM; goto out_unload; } - inode->i_ino = 0; inode->i_size = i_size_read(sb->s_bdev->bd_inode); inode->i_mapping->a_ops = &jfs_metapage_aops; inode_fake_hash(inode); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index db41e7803163..f9273f6901c8 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -932,6 +932,7 @@ static int jfs_xattr_get(const struct xattr_handler *handler, } static int jfs_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -950,6 +951,7 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler, } static int jfs_xattr_set_os2(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 7a53eed69fef..7e0e62deab53 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1110,7 +1110,8 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, return ret; } -static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, +static int kernfs_iop_mkdir(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode) { struct kernfs_node *parent = dir->i_private; @@ -1147,7 +1148,8 @@ static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) return ret; } -static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, +static int kernfs_iop_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index fc2469a20fed..d73950fc3d57 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -112,7 +112,8 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) return ret; } -int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) +int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct kernfs_node *kn = inode->i_private; @@ -122,7 +123,7 @@ int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) return -EINVAL; mutex_lock(&kernfs_mutex); - error = setattr_prepare(dentry, iattr); + error = setattr_prepare(&init_user_ns, dentry, iattr); if (error) goto out; @@ -131,7 +132,7 @@ int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) goto out; /* this ignores size changes */ - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); out: mutex_unlock(&kernfs_mutex); @@ -183,7 +184,8 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) set_nlink(inode, kn->dir.subdirs + 2); } -int kernfs_iop_getattr(const struct path *path, struct kstat *stat, +int kernfs_iop_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -193,7 +195,7 @@ int kernfs_iop_getattr(const struct path *path, struct kstat *stat, kernfs_refresh_inode(kn, inode); mutex_unlock(&kernfs_mutex); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); return 0; } @@ -272,7 +274,8 @@ void kernfs_evict_inode(struct inode *inode) kernfs_put(kn); } -int kernfs_iop_permission(struct inode *inode, int mask) +int kernfs_iop_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct kernfs_node *kn; @@ -285,7 +288,7 @@ int kernfs_iop_permission(struct inode *inode, int mask) kernfs_refresh_inode(kn, inode); mutex_unlock(&kernfs_mutex); - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } int kernfs_xattr_get(struct kernfs_node *kn, const char *name, @@ -319,6 +322,7 @@ static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, } static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) @@ -385,6 +389,7 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, } static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 7ee97ef59184..ccc3b44f6306 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -89,9 +89,12 @@ extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; */ extern const struct xattr_handler *kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); -int kernfs_iop_permission(struct inode *inode, int mask); -int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); -int kernfs_iop_getattr(const struct path *path, struct kstat *stat, +int kernfs_iop_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); +int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr); +int kernfs_iop_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); diff --git a/fs/libfs.c b/fs/libfs.c index 1e551766bc52..e2de5401abca 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -27,11 +27,12 @@ #include "internal.h" -int simple_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int simple_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; } @@ -447,9 +448,9 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL(simple_rmdir); -int simple_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode *inode = d_inode(old_dentry); int they_are_dirs = d_is_dir(old_dentry); @@ -492,18 +493,19 @@ EXPORT_SYMBOL(simple_rename); * on simple regular filesystems. Anything that needs to change on-disk * or wire state on size changes needs its own setattr method. */ -int simple_setattr(struct dentry *dentry, struct iattr *iattr) +int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(dentry, iattr); + error = setattr_prepare(mnt_userns, dentry, iattr); if (error) return error; if (iattr->ia_valid & ATTR_SIZE) truncate_setsize(inode, iattr->ia_size); - setattr_copy(inode, iattr); + setattr_copy(mnt_userns, inode, iattr); mark_inode_dirty(inode); return 0; } @@ -1295,15 +1297,17 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENOENT); } -static int empty_dir_getattr(const struct path *path, struct kstat *stat, +static int empty_dir_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); return 0; } -static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr) +static int empty_dir_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { return -EPERM; } diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index f4e5e5181a14..9115948c624e 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -252,7 +252,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error) iput(inode); return NULL; } - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_ino = j; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; diff --git a/fs/minix/file.c b/fs/minix/file.c index c50b0a20fcd9..6a7bd2d9eec0 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -22,12 +22,13 @@ const struct file_operations minix_file_operations = { .splice_read = generic_file_splice_read, }; -static int minix_setattr(struct dentry *dentry, struct iattr *attr) +static int minix_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -41,7 +42,7 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr) minix_truncate(inode); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 34f546404aa1..a532a99bbe81 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -652,13 +652,13 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } -int minix_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int minix_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct super_block *sb = path->dentry->d_sb; struct inode *inode = d_inode(path->dentry); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); if (INODE_VERSION(inode) == MINIX_V1) stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); else diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 168d45d3de73..202173368025 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -51,7 +51,8 @@ extern unsigned long minix_count_free_inodes(struct super_block *sb); extern int minix_new_block(struct inode * inode); extern void minix_free_block(struct inode *inode, unsigned long block); extern unsigned long minix_count_free_blocks(struct super_block *sb); -extern int minix_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int minix_getattr(struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); extern void V1_minix_truncate(struct inode *); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 1a6084d2b02e..937fa5fae2b8 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -33,7 +33,8 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, un return d_splice_alias(inode, dentry); } -static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) +static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { int error; struct inode *inode; @@ -51,7 +52,8 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, return error; } -static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; struct inode *inode = minix_new_inode(dir, mode, &error); @@ -63,14 +65,14 @@ static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return error; } -static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int minix_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - return minix_mknod(dir, dentry, mode, 0); + return minix_mknod(mnt_userns, dir, dentry, mode, 0); } -static int minix_symlink(struct inode * dir, struct dentry *dentry, - const char * symname) +static int minix_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { int err = -ENAMETOOLONG; int i = strlen(symname)+1; @@ -109,7 +111,8 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int minix_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode) +static int minix_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode * inode; int err; @@ -181,8 +184,9 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry) return err; } -static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, - struct inode * new_dir, struct dentry *new_dentry, +static int minix_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode * old_inode = d_inode(old_dentry); diff --git a/fs/mount.h b/fs/mount.h index ce6c376e0bc2..0b6e08cf8afb 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -124,16 +124,6 @@ static inline void get_mnt_ns(struct mnt_namespace *ns) extern seqlock_t mount_lock; -static inline void lock_mount_hash(void) -{ - write_seqlock(&mount_lock); -} - -static inline void unlock_mount_hash(void) -{ - write_sequnlock(&mount_lock); -} - struct proc_mounts { struct mnt_namespace *ns; struct path root; diff --git a/fs/mpage.c b/fs/mpage.c index 830e6cc2a9e7..961234d68779 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -304,9 +304,7 @@ alloc_new: goto out; } args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - min_t(int, args->nr_pages, - BIO_MAX_PAGES), - gfp); + bio_max_segs(args->nr_pages), gfp); if (args->bio == NULL) goto confused; } diff --git a/fs/namei.c b/fs/namei.c index de74ad2bc6e2..216f16e74351 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -259,7 +259,24 @@ void putname(struct filename *name) __putname(name); } -static int check_acl(struct inode *inode, int mask) +/** + * check_acl - perform ACL permission checking + * @mnt_userns: user namespace of the mount the inode was found from + * @inode: inode to check permissions on + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) + * + * This function performs the ACL permission checking. Since this function + * retrieve POSIX acls it needs to know whether it is called from a blocking or + * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + */ +static int check_acl(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *acl; @@ -271,14 +288,14 @@ static int check_acl(struct inode *inode, int mask) /* no ->get_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; - return posix_acl_permission(inode, acl, mask); + return posix_acl_permission(mnt_userns, inode, acl, mask); } acl = get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { - int error = posix_acl_permission(inode, acl, mask); + int error = posix_acl_permission(mnt_userns, inode, acl, mask); posix_acl_release(acl); return error; } @@ -287,18 +304,31 @@ static int check_acl(struct inode *inode, int mask) return -EAGAIN; } -/* - * This does the basic UNIX permission checking. +/** + * acl_permission_check - perform basic UNIX permission checking + * @mnt_userns: user namespace of the mount the inode was found from + * @inode: inode to check permissions on + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) + * + * This function performs the basic UNIX permission checking. Since this + * function may retrieve POSIX acls it needs to know whether it is called from a + * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * - * Note that the POSIX ACL check cares about the MAY_NOT_BLOCK bit, - * for RCU walking. + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. */ -static int acl_permission_check(struct inode *inode, int mask) +static int acl_permission_check(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { unsigned int mode = inode->i_mode; + kuid_t i_uid; /* Are we the owner? If so, ACL's don't matter */ - if (likely(uid_eq(current_fsuid(), inode->i_uid))) { + i_uid = i_uid_into_mnt(mnt_userns, inode); + if (likely(uid_eq(current_fsuid(), i_uid))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? -EACCES : 0; @@ -306,7 +336,7 @@ static int acl_permission_check(struct inode *inode, int mask) /* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { - int error = check_acl(inode, mask); + int error = check_acl(mnt_userns, inode, mask); if (error != -EAGAIN) return error; } @@ -320,7 +350,8 @@ static int acl_permission_check(struct inode *inode, int mask) * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { - if (in_group_p(inode->i_gid)) + kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); + if (in_group_p(kgid)) mode >>= 3; } @@ -330,6 +361,7 @@ static int acl_permission_check(struct inode *inode, int mask) /** * generic_permission - check for access rights on a Posix-like filesystem + * @mnt_userns: user namespace of the mount the inode was found from * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, * %MAY_NOT_BLOCK ...) @@ -342,25 +374,33 @@ static int acl_permission_check(struct inode *inode, int mask) * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. */ -int generic_permission(struct inode *inode, int mask) +int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { int ret; /* * Do the basic permission checks. */ - ret = acl_permission_check(inode, mask); + ret = acl_permission_check(mnt_userns, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) - if (capable_wrt_inode_uidgid(inode, + if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_DAC_READ_SEARCH)) return 0; - if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(mnt_userns, inode, + CAP_DAC_OVERRIDE)) return 0; return -EACCES; } @@ -370,7 +410,8 @@ int generic_permission(struct inode *inode, int mask) */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(mnt_userns, inode, + CAP_DAC_READ_SEARCH)) return 0; /* * Read/write DACs are always overridable. @@ -378,31 +419,38 @@ int generic_permission(struct inode *inode, int mask) * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(mnt_userns, inode, + CAP_DAC_OVERRIDE)) return 0; return -EACCES; } EXPORT_SYMBOL(generic_permission); -/* +/** + * do_inode_permission - UNIX permission checking + * @mnt_userns: user namespace of the mount the inode was found from + * @inode: inode to check permissions on + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) + * * We _really_ want to just do "generic_permission()" without * even looking at the inode->i_op values. So we keep a cache * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ -static inline int do_inode_permission(struct inode *inode, int mask) +static inline int do_inode_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) - return inode->i_op->permission(inode, mask); + return inode->i_op->permission(mnt_userns, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } - return generic_permission(inode, mask); + return generic_permission(mnt_userns, inode, mask); } /** @@ -427,8 +475,9 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) /** * inode_permission - Check for access rights to a given inode - * @inode: Inode to check permission on - * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @mnt_userns: User namespace of the mount the inode was found from + * @inode: Inode to check permission on + * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Check for read/write/execute permissions on an inode. We use fs[ug]id for * this, letting us set arbitrary permissions for filesystem access without @@ -436,7 +485,8 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ -int inode_permission(struct inode *inode, int mask) +int inode_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { int retval; @@ -456,11 +506,11 @@ int inode_permission(struct inode *inode, int mask) * written back improperly if their true value is unknown * to the vfs. */ - if (HAS_UNMAPPED_ID(inode)) + if (HAS_UNMAPPED_ID(mnt_userns, inode)) return -EACCES; } - retval = do_inode_permission(inode, mask); + retval = do_inode_permission(mnt_userns, inode, mask); if (retval) return retval; @@ -960,11 +1010,16 @@ int sysctl_protected_regular __read_mostly; */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { + struct user_namespace *mnt_userns; + kuid_t i_uid; + if (!sysctl_protected_symlinks) return 0; + mnt_userns = mnt_user_ns(nd->path.mnt); + i_uid = i_uid_into_mnt(mnt_userns, inode); /* Allowed if owner and follower match. */ - if (uid_eq(current_cred()->fsuid, inode->i_uid)) + if (uid_eq(current_cred()->fsuid, i_uid)) return 0; /* Allowed if parent directory not sticky and world-writable. */ @@ -972,7 +1027,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod return 0; /* Allowed if parent directory and link owner match. */ - if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid)) + if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid)) return 0; if (nd->flags & LOOKUP_RCU) @@ -985,6 +1040,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod /** * safe_hardlink_source - Check for safe hardlink conditions + * @mnt_userns: user namespace of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: @@ -995,7 +1051,8 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod * * Otherwise returns true. */ -static bool safe_hardlink_source(struct inode *inode) +static bool safe_hardlink_source(struct user_namespace *mnt_userns, + struct inode *inode) { umode_t mode = inode->i_mode; @@ -1012,7 +1069,7 @@ static bool safe_hardlink_source(struct inode *inode) return false; /* Hardlinking to unreadable or unwritable sources is dangerous. */ - if (inode_permission(inode, MAY_READ | MAY_WRITE)) + if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE)) return false; return true; @@ -1020,6 +1077,7 @@ static bool safe_hardlink_source(struct inode *inode) /** * may_linkat - Check permissions for creating a hardlink + * @mnt_userns: user namespace of the mount the inode was found from * @link: the source to hardlink from * * Block hardlink when all of: @@ -1028,14 +1086,21 @@ static bool safe_hardlink_source(struct inode *inode) * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + * * Returns 0 if successful, -ve on error. */ -int may_linkat(struct path *link) +int may_linkat(struct user_namespace *mnt_userns, struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ - if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) + if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) || + !gid_valid(i_gid_into_mnt(mnt_userns, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) @@ -1044,7 +1109,8 @@ int may_linkat(struct path *link) /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ - if (safe_hardlink_source(inode) || inode_owner_or_capable(inode)) + if (safe_hardlink_source(mnt_userns, inode) || + inode_owner_or_capable(mnt_userns, inode)) return 0; audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); @@ -1055,6 +1121,7 @@ int may_linkat(struct path *link) * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. + * @mnt_userns: user namespace of the mount the inode was found from * @dir_mode: mode bits of directory * @dir_uid: owner of directory * @inode: the inode of the file to open @@ -1070,16 +1137,25 @@ int may_linkat(struct path *link) * the directory doesn't have to be world writable: being group writable will * be enough. * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + * * Returns 0 if the open is allowed, -ve on error. */ -static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid, - struct inode * const inode) +static int may_create_in_sticky(struct user_namespace *mnt_userns, + struct nameidata *nd, struct inode *const inode) { + umode_t dir_mode = nd->dir_mode; + kuid_t dir_uid = nd->dir_uid; + if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || likely(!(dir_mode & S_ISVTX)) || - uid_eq(inode->i_uid, dir_uid) || - uid_eq(current_fsuid(), inode->i_uid)) + uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) || + uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) return 0; if (likely(dir_mode & 0002) || @@ -1569,14 +1645,15 @@ static struct dentry *lookup_slow(const struct qstr *name, return res; } -static inline int may_lookup(struct nameidata *nd) +static inline int may_lookup(struct user_namespace *mnt_userns, + struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { - int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); + int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD || !try_to_unlazy(nd)) return err; } - return inode_permission(nd->inode, MAY_EXEC); + return inode_permission(mnt_userns, nd->inode, MAY_EXEC); } static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) @@ -2122,11 +2199,13 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. */ for(;;) { + struct user_namespace *mnt_userns; const char *link; u64 hash_len; int type; - err = may_lookup(nd); + mnt_userns = mnt_user_ns(nd->path.mnt); + err = may_lookup(mnt_userns, nd); if (err) return err; @@ -2174,7 +2253,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) OK: /* pathname or trailing symlink, done */ if (!depth) { - nd->dir_uid = nd->inode->i_uid; + nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; @@ -2511,7 +2590,7 @@ static int lookup_one_len_common(const char *name, struct dentry *base, return err; } - return inode_permission(base->d_inode, MAY_EXEC); + return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC); } /** @@ -2656,15 +2735,16 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags, } EXPORT_SYMBOL(user_path_at_empty); -int __check_sticky(struct inode *dir, struct inode *inode) +int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir, + struct inode *inode) { kuid_t fsuid = current_fsuid(); - if (uid_eq(inode->i_uid, fsuid)) + if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid)) return 0; - if (uid_eq(dir->i_uid, fsuid)) + if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid)) return 0; - return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); + return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER); } EXPORT_SYMBOL(__check_sticky); @@ -2688,7 +2768,8 @@ EXPORT_SYMBOL(__check_sticky); * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ -static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) +static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); int error; @@ -2700,19 +2781,21 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ - if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) + if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) || + !gid_valid(i_gid_into_mnt(mnt_userns, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(dir, inode) || IS_APPEND(inode) || - IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode)) + if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) || + IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || + HAS_UNMAPPED_ID(mnt_userns, inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -2737,7 +2820,8 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) * 4. We should have write and exec permissions on dir * 5. We can't do it if dir is immutable (done in permission()) */ -static inline int may_create(struct inode *dir, struct dentry *child) +static inline int may_create(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *child) { struct user_namespace *s_user_ns; audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); @@ -2746,10 +2830,10 @@ static inline int may_create(struct inode *dir, struct dentry *child) if (IS_DEADDIR(dir)) return -ENOENT; s_user_ns = dir->i_sb->s_user_ns; - if (!kuid_has_mapping(s_user_ns, current_fsuid()) || - !kgid_has_mapping(s_user_ns, current_fsgid())) + if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) || + !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns))) return -EOVERFLOW; - return inode_permission(dir, MAY_WRITE | MAY_EXEC); + return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); } /* @@ -2796,10 +2880,26 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) } EXPORT_SYMBOL(unlock_rename); -int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool want_excl) +/** + * vfs_create - create new file + * @mnt_userns: user namespace of the mount the inode was found from + * @dir: inode of @dentry + * @dentry: pointer to dentry of the base directory + * @mode: mode of the new file + * @want_excl: whether the file must not yet exist + * + * Create a new file. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + */ +int vfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool want_excl) { - int error = may_create(dir, dentry); + int error = may_create(mnt_userns, dir, dentry); if (error) return error; @@ -2810,7 +2910,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, error = security_inode_create(dir, dentry, mode); if (error) return error; - error = dir->i_op->create(dir, dentry, mode, want_excl); + error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; @@ -2822,7 +2922,7 @@ int vfs_mkobj(struct dentry *dentry, umode_t mode, void *arg) { struct inode *dir = dentry->d_parent->d_inode; - int error = may_create(dir, dentry); + int error = may_create(&init_user_ns, dir, dentry); if (error) return error; @@ -2844,7 +2944,8 @@ bool may_open_dev(const struct path *path) !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } -static int may_open(const struct path *path, int acc_mode, int flag) +static int may_open(struct user_namespace *mnt_userns, const struct path *path, + int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; @@ -2879,7 +2980,7 @@ static int may_open(const struct path *path, int acc_mode, int flag) break; } - error = inode_permission(inode, MAY_OPEN | acc_mode); + error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode); if (error) return error; @@ -2894,13 +2995,13 @@ static int may_open(const struct path *path, int acc_mode, int flag) } /* O_NOATIME can only be set by the owner or superuser */ - if (flag & O_NOATIME && !inode_owner_or_capable(inode)) + if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode)) return -EPERM; return 0; } -static int handle_truncate(struct file *filp) +static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; @@ -2914,7 +3015,7 @@ static int handle_truncate(struct file *filp) if (!error) error = security_path_truncate(path); if (!error) { - error = do_truncate(path->dentry, 0, + error = do_truncate(mnt_userns, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } @@ -2929,7 +3030,9 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode) +static int may_o_create(struct user_namespace *mnt_userns, + const struct path *dir, struct dentry *dentry, + umode_t mode) { struct user_namespace *s_user_ns; int error = security_path_mknod(dir, dentry, mode, 0); @@ -2937,11 +3040,12 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m return error; s_user_ns = dir->dentry->d_sb->s_user_ns; - if (!kuid_has_mapping(s_user_ns, current_fsuid()) || - !kgid_has_mapping(s_user_ns, current_fsgid())) + if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) || + !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns))) return -EOVERFLOW; - error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); + error = inode_permission(mnt_userns, dir->dentry->d_inode, + MAY_WRITE | MAY_EXEC); if (error) return error; @@ -3020,6 +3124,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, bool got_write) { + struct user_namespace *mnt_userns; struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; @@ -3067,13 +3172,15 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, */ if (unlikely(!got_write)) open_flag &= ~O_TRUNC; + mnt_userns = mnt_user_ns(nd->path.mnt); if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; if (!IS_POSIXACL(dir->d_inode)) mode &= ~current_umask(); if (likely(got_write)) - create_error = may_o_create(&nd->path, dentry, mode); + create_error = may_o_create(mnt_userns, &nd->path, + dentry, mode); else create_error = -EROFS; } @@ -3108,8 +3215,9 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, error = -EACCES; goto out_dput; } - error = dir_inode->i_op->create(dir_inode, dentry, mode, - open_flag & O_EXCL); + + error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry, + mode, open_flag & O_EXCL); if (error) goto out_dput; } @@ -3213,6 +3321,7 @@ finish_lookup: static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { + struct user_namespace *mnt_userns; int open_flag = op->open_flag; bool do_truncate; int acc_mode; @@ -3225,12 +3334,13 @@ static int do_open(struct nameidata *nd, } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); + mnt_userns = mnt_user_ns(nd->path.mnt); if (open_flag & O_CREAT) { if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; - error = may_create_in_sticky(nd->dir_mode, nd->dir_uid, + error = may_create_in_sticky(mnt_userns, nd, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return error; @@ -3250,13 +3360,13 @@ static int do_open(struct nameidata *nd, return error; do_truncate = true; } - error = may_open(&nd->path, acc_mode, open_flag); + error = may_open(mnt_userns, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) error = ima_file_check(file, op->acc_mode); if (!error && do_truncate) - error = handle_truncate(file); + error = handle_truncate(mnt_userns, file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; @@ -3266,7 +3376,23 @@ static int do_open(struct nameidata *nd, return error; } -struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) +/** + * vfs_tmpfile - create tmpfile + * @mnt_userns: user namespace of the mount the inode was found from + * @dentry: pointer to dentry of the base directory + * @mode: mode of the new tmpfile + * @open_flags: flags + * + * Create a temporary file. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + */ +struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, + struct dentry *dentry, umode_t mode, int open_flag) { struct dentry *child = NULL; struct inode *dir = dentry->d_inode; @@ -3274,7 +3400,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) int error; /* we want directory to be writable */ - error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); if (error) goto out_err; error = -EOPNOTSUPP; @@ -3284,7 +3410,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) child = d_alloc(dentry, &slash_name); if (unlikely(!child)) goto out_err; - error = dir->i_op->tmpfile(dir, child, mode); + error = dir->i_op->tmpfile(mnt_userns, dir, child, mode); if (error) goto out_err; error = -ENOENT; @@ -3296,7 +3422,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } - ima_post_create_tmpfile(inode); + ima_post_create_tmpfile(mnt_userns, inode); return child; out_err: @@ -3309,6 +3435,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { + struct user_namespace *mnt_userns; struct dentry *child; struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); @@ -3317,7 +3444,8 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; - child = vfs_tmpfile(path.dentry, op->mode, op->open_flag); + mnt_userns = mnt_user_ns(path.mnt); + child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag); error = PTR_ERR(child); if (IS_ERR(child)) goto out2; @@ -3325,7 +3453,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, path.dentry = child; audit_inode(nd->name, child, 0); /* Don't check for other permissions, the inode was just created */ - error = may_open(&path, 0, op->open_flag); + error = may_open(mnt_userns, &path, 0, op->open_flag); if (!error) error = vfs_open(&path, file); out2: @@ -3527,10 +3655,27 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname, } EXPORT_SYMBOL(user_path_create); -int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +/** + * vfs_mknod - create device node or file + * @mnt_userns: user namespace of the mount the inode was found from + * @dir: inode of @dentry + * @dentry: pointer to dentry of the base directory + * @mode: mode of the new device node or file + * @dev: device number of device to create + * + * Create a device node or file. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + */ +int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; - int error = may_create(dir, dentry); + int error = may_create(mnt_userns, dir, dentry); if (error) return error; @@ -3550,7 +3695,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (error) return error; - error = dir->i_op->mknod(dir, dentry, mode, dev); + error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; @@ -3577,6 +3722,7 @@ static int may_mknod(umode_t mode) static long do_mknodat(int dfd, const char __user *filename, umode_t mode, unsigned int dev) { + struct user_namespace *mnt_userns; struct dentry *dentry; struct path path; int error; @@ -3595,18 +3741,22 @@ retry: error = security_path_mknod(&path, dentry, mode, dev); if (error) goto out; + + mnt_userns = mnt_user_ns(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(path.dentry->d_inode,dentry,mode,true); + error = vfs_create(mnt_userns, path.dentry->d_inode, + dentry, mode, true); if (!error) - ima_post_path_mknod(dentry); + ima_post_path_mknod(mnt_userns, dentry); break; case S_IFCHR: case S_IFBLK: - error = vfs_mknod(path.dentry->d_inode,dentry,mode, - new_decode_dev(dev)); + error = vfs_mknod(mnt_userns, path.dentry->d_inode, + dentry, mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: - error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); + error = vfs_mknod(mnt_userns, path.dentry->d_inode, + dentry, mode, 0); break; } out: @@ -3629,9 +3779,25 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d return do_mknodat(AT_FDCWD, filename, mode, dev); } -int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +/** + * vfs_mkdir - create directory + * @mnt_userns: user namespace of the mount the inode was found from + * @dir: inode of @dentry + * @dentry: pointer to dentry of the base directory + * @mode: mode of the new directory + * + * Create a directory. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + */ +int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { - int error = may_create(dir, dentry); + int error = may_create(mnt_userns, dir, dentry); unsigned max_links = dir->i_sb->s_max_links; if (error) @@ -3648,7 +3814,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (max_links && dir->i_nlink >= max_links) return -EMLINK; - error = dir->i_op->mkdir(dir, dentry, mode); + error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); return error; @@ -3670,8 +3836,12 @@ retry: if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); error = security_path_mkdir(&path, dentry, mode); - if (!error) - error = vfs_mkdir(path.dentry->d_inode, dentry, mode); + if (!error) { + struct user_namespace *mnt_userns; + mnt_userns = mnt_user_ns(path.mnt); + error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry, + mode); + } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -3690,9 +3860,24 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) return do_mkdirat(AT_FDCWD, pathname, mode); } -int vfs_rmdir(struct inode *dir, struct dentry *dentry) +/** + * vfs_rmdir - remove directory + * @mnt_userns: user namespace of the mount the inode was found from + * @dir: inode of @dentry + * @dentry: pointer to dentry of the base directory + * + * Remove a directory. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + */ +int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry) { - int error = may_delete(dir, dentry, 1); + int error = may_delete(mnt_userns, dir, dentry, 1); if (error) return error; @@ -3732,6 +3917,7 @@ EXPORT_SYMBOL(vfs_rmdir); long do_rmdir(int dfd, struct filename *name) { + struct user_namespace *mnt_userns; int error = 0; struct dentry *dentry; struct path path; @@ -3772,7 +3958,8 @@ retry: error = security_path_rmdir(&path, dentry); if (error) goto exit3; - error = vfs_rmdir(path.dentry->d_inode, dentry); + mnt_userns = mnt_user_ns(path.mnt); + error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry); exit3: dput(dentry); exit2: @@ -3795,6 +3982,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) /** * vfs_unlink - unlink a filesystem object + * @mnt_userns: user namespace of the mount the inode was found from * @dir: parent directory * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. @@ -3810,11 +3998,18 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. */ -int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) +int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; - int error = may_delete(dir, dentry, 0); + int error = may_delete(mnt_userns, dir, dentry, 0); if (error) return error; @@ -3885,6 +4080,8 @@ retry_deleg: dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { + struct user_namespace *mnt_userns; + /* Why not before? Because we want correct error value */ if (last.name[last.len]) goto slashes; @@ -3895,7 +4092,9 @@ retry_deleg: error = security_path_unlink(&path, dentry); if (error) goto exit2; - error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode); + mnt_userns = mnt_user_ns(path.mnt); + error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry, + &delegated_inode); exit2: dput(dentry); } @@ -3944,9 +4143,25 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) return do_unlinkat(AT_FDCWD, getname(pathname)); } -int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) +/** + * vfs_symlink - create symlink + * @mnt_userns: user namespace of the mount the inode was found from + * @dir: inode of @dentry + * @dentry: pointer to dentry of the base directory + * @oldname: name of the file to link to + * + * Create a symlink. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + */ +int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *oldname) { - int error = may_create(dir, dentry); + int error = may_create(mnt_userns, dir, dentry); if (error) return error; @@ -3958,7 +4173,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) if (error) return error; - error = dir->i_op->symlink(dir, dentry, oldname); + error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; @@ -3984,8 +4199,13 @@ retry: goto out_putname; error = security_path_symlink(&path, dentry, from->name); - if (!error) - error = vfs_symlink(path.dentry->d_inode, dentry, from->name); + if (!error) { + struct user_namespace *mnt_userns; + + mnt_userns = mnt_user_ns(path.mnt); + error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry, + from->name); + } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -4010,6 +4230,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn /** * vfs_link - create a new link * @old_dentry: object to be linked + * @mnt_userns: the user namespace of the mount * @dir: new parent * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break @@ -4025,8 +4246,16 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. */ -int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) +int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *new_dentry, + struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; @@ -4035,7 +4264,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (!inode) return -ENOENT; - error = may_create(dir, new_dentry); + error = may_create(mnt_userns, dir, new_dentry); if (error) return error; @@ -4052,7 +4281,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de * be writen back improperly if their true value is unknown to * the vfs. */ - if (HAS_UNMAPPED_ID(inode)) + if (HAS_UNMAPPED_ID(mnt_userns, inode)) return -EPERM; if (!dir->i_op->link) return -EPERM; @@ -4099,6 +4328,7 @@ EXPORT_SYMBOL(vfs_link); static int do_linkat(int olddfd, const char __user *oldname, int newdfd, const char __user *newname, int flags) { + struct user_namespace *mnt_userns; struct dentry *new_dentry; struct path old_path, new_path; struct inode *delegated_inode = NULL; @@ -4134,13 +4364,15 @@ retry: error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - error = may_linkat(&old_path); + mnt_userns = mnt_user_ns(new_path.mnt); + error = may_linkat(mnt_userns, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode); + error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode, + new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); if (delegated_inode) { @@ -4174,12 +4406,14 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname /** * vfs_rename - rename a filesystem object - * @old_dir: parent of source - * @old_dentry: source - * @new_dir: parent of destination - * @new_dentry: destination - * @delegated_inode: returns an inode needing a delegation break - * @flags: rename flags + * @old_mnt_userns: old user namespace of the mount the inode was found from + * @old_dir: parent of source + * @old_dentry: source + * @new_mnt_userns: new user namespace of the mount the inode was found from + * @new_dir: parent of destination + * @new_dentry: destination + * @delegated_inode: returns an inode needing a delegation break + * @flags: rename flags * * The caller must hold multiple mutexes--see lock_rename()). * @@ -4222,11 +4456,14 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ -int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - struct inode **delegated_inode, unsigned int flags) +int vfs_rename(struct renamedata *rd) { int error; + struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir; + struct dentry *old_dentry = rd->old_dentry; + struct dentry *new_dentry = rd->new_dentry; + struct inode **delegated_inode = rd->delegated_inode; + unsigned int flags = rd->flags; bool is_dir = d_is_dir(old_dentry); struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; @@ -4237,19 +4474,21 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (source == target) return 0; - error = may_delete(old_dir, old_dentry, is_dir); + error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { - error = may_create(new_dir, new_dentry); + error = may_create(rd->new_mnt_userns, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) - error = may_delete(new_dir, new_dentry, is_dir); + error = may_delete(rd->new_mnt_userns, new_dir, + new_dentry, is_dir); else - error = may_delete(new_dir, new_dentry, new_is_dir); + error = may_delete(rd->new_mnt_userns, new_dir, + new_dentry, new_is_dir); } if (error) return error; @@ -4263,12 +4502,14 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, */ if (new_dir != old_dir) { if (is_dir) { - error = inode_permission(source, MAY_WRITE); + error = inode_permission(rd->old_mnt_userns, source, + MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { - error = inode_permission(target, MAY_WRITE); + error = inode_permission(rd->new_mnt_userns, target, + MAY_WRITE); if (error) return error; } @@ -4308,8 +4549,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) goto out; } - error = old_dir->i_op->rename(old_dir, old_dentry, - new_dir, new_dentry, flags); + error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry, + new_dir, new_dentry, flags); if (error) goto out; @@ -4350,6 +4591,7 @@ EXPORT_SYMBOL(vfs_rename); int do_renameat2(int olddfd, struct filename *from, int newdfd, struct filename *to, unsigned int flags) { + struct renamedata rd; struct dentry *old_dentry, *new_dentry; struct dentry *trap; struct path old_path, new_path; @@ -4453,9 +4695,16 @@ retry_deleg: &new_path, new_dentry, flags); if (error) goto exit5; - error = vfs_rename(old_path.dentry->d_inode, old_dentry, - new_path.dentry->d_inode, new_dentry, - &delegated_inode, flags); + + rd.old_dir = old_path.dentry->d_inode; + rd.old_dentry = old_dentry; + rd.old_mnt_userns = mnt_user_ns(old_path.mnt); + rd.new_dir = new_path.dentry->d_inode; + rd.new_dentry = new_dentry; + rd.new_mnt_userns = mnt_user_ns(new_path.mnt); + rd.delegated_inode = &delegated_inode; + rd.flags = flags; + error = vfs_rename(&rd); exit5: dput(new_dentry); exit4: diff --git a/fs/namespace.c b/fs/namespace.c index 9d33909d0f9e..56bb5a5fdc0d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -25,6 +25,7 @@ #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/memblock.h> +#include <linux/proc_fs.h> #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> @@ -73,6 +74,15 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +struct mount_kattr { + unsigned int attr_set; + unsigned int attr_clr; + unsigned int propagation; + unsigned int lookup_flags; + bool recurse; + struct user_namespace *mnt_userns; +}; + /* /sys/fs */ struct kobject *fs_kobj; EXPORT_SYMBOL_GPL(fs_kobj); @@ -87,6 +97,16 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); +static inline void lock_mount_hash(void) +{ + write_seqlock(&mount_lock); +} + +static inline void unlock_mount_hash(void) +{ + write_sequnlock(&mount_lock); +} + static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); @@ -210,6 +230,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); + mnt->mnt.mnt_userns = &init_user_ns; } return mnt; @@ -360,50 +381,36 @@ int mnt_want_write(struct vfsmount *m) EXPORT_SYMBOL_GPL(mnt_want_write); /** - * mnt_clone_write - get write access to a mount - * @mnt: the mount on which to take a write - * - * This is effectively like mnt_want_write, except - * it must only be used to take an extra write reference - * on a mountpoint that we already know has a write reference - * on it. This allows some optimisation. - * - * After finished, mnt_drop_write must be called as usual to - * drop the reference. - */ -int mnt_clone_write(struct vfsmount *mnt) -{ - /* superblock may be r/o */ - if (__mnt_is_readonly(mnt)) - return -EROFS; - preempt_disable(); - mnt_inc_writers(real_mount(mnt)); - preempt_enable(); - return 0; -} -EXPORT_SYMBOL_GPL(mnt_clone_write); - -/** * __mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like __mnt_want_write, but it takes a file and can - * do some optimisations if the file is open for write already + * This is like __mnt_want_write, but if the file is already open for writing it + * skips incrementing mnt_writers (since the open file already has a reference) + * and instead only does the check for emergency r/o remounts. This must be + * paired with __mnt_drop_write_file. */ int __mnt_want_write_file(struct file *file) { - if (!(file->f_mode & FMODE_WRITER)) - return __mnt_want_write(file->f_path.mnt); - else - return mnt_clone_write(file->f_path.mnt); + if (file->f_mode & FMODE_WRITER) { + /* + * Superblock may have become readonly while there are still + * writable fd's, e.g. due to a fs error with errors=remount-ro + */ + if (__mnt_is_readonly(file->f_path.mnt)) + return -EROFS; + return 0; + } + return __mnt_want_write(file->f_path.mnt); } /** * mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like mnt_want_write, but it takes a file and can - * do some optimisations if the file is open for write already + * This is like mnt_want_write, but if the file is already open for writing it + * skips incrementing mnt_writers (since the open file already has a reference) + * and instead only does the freeze protection and the check for emergency r/o + * remounts. This must be paired with mnt_drop_write_file. */ int mnt_want_write_file(struct file *file) { @@ -449,7 +456,8 @@ EXPORT_SYMBOL_GPL(mnt_drop_write); void __mnt_drop_write_file(struct file *file) { - __mnt_drop_write(file->f_path.mnt); + if (!(file->f_mode & FMODE_WRITER)) + __mnt_drop_write(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) @@ -459,11 +467,8 @@ void mnt_drop_write_file(struct file *file) } EXPORT_SYMBOL(mnt_drop_write_file); -static int mnt_make_readonly(struct mount *mnt) +static inline int mnt_hold_writers(struct mount *mnt) { - int ret = 0; - - lock_mount_hash(); mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store @@ -488,25 +493,30 @@ static int mnt_make_readonly(struct mount *mnt) * we're counting up here. */ if (mnt_get_writers(mnt) > 0) - ret = -EBUSY; - else - mnt->mnt.mnt_flags |= MNT_READONLY; + return -EBUSY; + + return 0; +} + +static inline void mnt_unhold_writers(struct mount *mnt) +{ /* * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; - unlock_mount_hash(); - return ret; } -static int __mnt_unmake_readonly(struct mount *mnt) +static int mnt_make_readonly(struct mount *mnt) { - lock_mount_hash(); - mnt->mnt.mnt_flags &= ~MNT_READONLY; - unlock_mount_hash(); - return 0; + int ret; + + ret = mnt_hold_writers(mnt); + if (!ret) + mnt->mnt.mnt_flags |= MNT_READONLY; + mnt_unhold_writers(mnt); + return ret; } int sb_prepare_remount_readonly(struct super_block *sb) @@ -547,6 +557,11 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { + struct user_namespace *mnt_userns; + + mnt_userns = mnt_user_ns(&mnt->mnt); + if (mnt_userns != &init_user_ns) + put_user_ns(mnt_userns); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); @@ -1055,6 +1070,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); + mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt); + if (mnt->mnt.mnt_userns != &init_user_ns) + mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; @@ -2514,20 +2532,15 @@ static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) if (readonly_request) return mnt_make_readonly(mnt); - return __mnt_unmake_readonly(mnt); + mnt->mnt.mnt_flags &= ~MNT_READONLY; + return 0; } -/* - * Update the user-settable attributes on a mount. The caller must hold - * sb->s_umount for writing. - */ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) { - lock_mount_hash(); mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); - unlock_mount_hash(); } static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) @@ -2572,11 +2585,17 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; - down_write(&sb->s_umount); + /* + * We're only checking whether the superblock is read-only not + * changing it, so only take down_read(&sb->s_umount). + */ + down_read(&sb->s_umount); + lock_mount_hash(); ret = change_mount_ro_state(mnt, mnt_flags); if (ret == 0) set_mount_attributes(mnt, mnt_flags); - up_write(&sb->s_umount); + unlock_mount_hash(); + up_read(&sb->s_umount); mnt_warn_timestamp_expiry(path, &mnt->mnt); @@ -2616,8 +2635,11 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, err = -EPERM; if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { err = reconfigure_super(fc); - if (!err) + if (!err) { + lock_mount_hash(); set_mount_attributes(mnt, mnt_flags); + unlock_mount_hash(); + } } up_write(&sb->s_umount); } @@ -3440,6 +3462,33 @@ out_type: return ret; } +#define FSMOUNT_VALID_FLAGS \ + (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ + MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME) + +#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP) + +#define MOUNT_SETATTR_PROPAGATION_FLAGS \ + (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED) + +static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) +{ + unsigned int mnt_flags = 0; + + if (attr_flags & MOUNT_ATTR_RDONLY) + mnt_flags |= MNT_READONLY; + if (attr_flags & MOUNT_ATTR_NOSUID) + mnt_flags |= MNT_NOSUID; + if (attr_flags & MOUNT_ATTR_NODEV) + mnt_flags |= MNT_NODEV; + if (attr_flags & MOUNT_ATTR_NOEXEC) + mnt_flags |= MNT_NOEXEC; + if (attr_flags & MOUNT_ATTR_NODIRATIME) + mnt_flags |= MNT_NODIRATIME; + + return mnt_flags; +} + /* * Create a kernel mount representation for a new, prepared superblock * (specified by fs_fd) and attach to an open_tree-like file descriptor. @@ -3462,24 +3511,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) return -EINVAL; - if (attr_flags & ~(MOUNT_ATTR_RDONLY | - MOUNT_ATTR_NOSUID | - MOUNT_ATTR_NODEV | - MOUNT_ATTR_NOEXEC | - MOUNT_ATTR__ATIME | - MOUNT_ATTR_NODIRATIME)) + if (attr_flags & ~FSMOUNT_VALID_FLAGS) return -EINVAL; - if (attr_flags & MOUNT_ATTR_RDONLY) - mnt_flags |= MNT_READONLY; - if (attr_flags & MOUNT_ATTR_NOSUID) - mnt_flags |= MNT_NOSUID; - if (attr_flags & MOUNT_ATTR_NODEV) - mnt_flags |= MNT_NODEV; - if (attr_flags & MOUNT_ATTR_NOEXEC) - mnt_flags |= MNT_NOEXEC; - if (attr_flags & MOUNT_ATTR_NODIRATIME) - mnt_flags |= MNT_NODIRATIME; + mnt_flags = attr_flags_to_mnt_flags(attr_flags); switch (attr_flags & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_STRICTATIME: @@ -3787,6 +3822,362 @@ out0: return error; } +static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) +{ + unsigned int flags = mnt->mnt.mnt_flags; + + /* flags to clear */ + flags &= ~kattr->attr_clr; + /* flags to raise */ + flags |= kattr->attr_set; + + return flags; +} + +static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) +{ + struct vfsmount *m = &mnt->mnt; + + if (!kattr->mnt_userns) + return 0; + + /* + * Once a mount has been idmapped we don't allow it to change its + * mapping. It makes things simpler and callers can just create + * another bind-mount they can idmap if they want to. + */ + if (mnt_user_ns(m) != &init_user_ns) + return -EPERM; + + /* The underlying filesystem doesn't support idmapped mounts yet. */ + if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) + return -EINVAL; + + /* We're not controlling the superblock. */ + if (!ns_capable(m->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Mount has already been visible in the filesystem hierarchy. */ + if (!is_anon_ns(mnt->mnt_ns)) + return -EINVAL; + + return 0; +} + +static struct mount *mount_setattr_prepare(struct mount_kattr *kattr, + struct mount *mnt, int *err) +{ + struct mount *m = mnt, *last = NULL; + + if (!is_mounted(&m->mnt)) { + *err = -EINVAL; + goto out; + } + + if (!(mnt_has_parent(m) ? check_mnt(m) : is_anon_ns(m->mnt_ns))) { + *err = -EINVAL; + goto out; + } + + do { + unsigned int flags; + + flags = recalc_flags(kattr, m); + if (!can_change_locked_flags(m, flags)) { + *err = -EPERM; + goto out; + } + + *err = can_idmap_mount(kattr, m); + if (*err) + goto out; + + last = m; + + if ((kattr->attr_set & MNT_READONLY) && + !(m->mnt.mnt_flags & MNT_READONLY)) { + *err = mnt_hold_writers(m); + if (*err) + goto out; + } + } while (kattr->recurse && (m = next_mnt(m, mnt))); + +out: + return last; +} + +static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) +{ + struct user_namespace *mnt_userns; + + if (!kattr->mnt_userns) + return; + + mnt_userns = get_user_ns(kattr->mnt_userns); + /* Pairs with smp_load_acquire() in mnt_user_ns(). */ + smp_store_release(&mnt->mnt.mnt_userns, mnt_userns); +} + +static void mount_setattr_commit(struct mount_kattr *kattr, + struct mount *mnt, struct mount *last, + int err) +{ + struct mount *m = mnt; + + do { + if (!err) { + unsigned int flags; + + do_idmap_mount(kattr, m); + flags = recalc_flags(kattr, m); + WRITE_ONCE(m->mnt.mnt_flags, flags); + } + + /* + * We either set MNT_READONLY above so make it visible + * before ~MNT_WRITE_HOLD or we failed to recursively + * apply mount options. + */ + if ((kattr->attr_set & MNT_READONLY) && + (m->mnt.mnt_flags & MNT_WRITE_HOLD)) + mnt_unhold_writers(m); + + if (!err && kattr->propagation) + change_mnt_propagation(m, kattr->propagation); + + /* + * On failure, only cleanup until we found the first mount + * we failed to handle. + */ + if (err && m == last) + break; + } while (kattr->recurse && (m = next_mnt(m, mnt))); + + if (!err) + touch_mnt_namespace(mnt->mnt_ns); +} + +static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) +{ + struct mount *mnt = real_mount(path->mnt), *last = NULL; + int err = 0; + + if (path->dentry != mnt->mnt.mnt_root) + return -EINVAL; + + if (kattr->propagation) { + /* + * Only take namespace_lock() if we're actually changing + * propagation. + */ + namespace_lock(); + if (kattr->propagation == MS_SHARED) { + err = invent_group_ids(mnt, kattr->recurse); + if (err) { + namespace_unlock(); + return err; + } + } + } + + lock_mount_hash(); + + /* + * Get the mount tree in a shape where we can change mount + * properties without failure. + */ + last = mount_setattr_prepare(kattr, mnt, &err); + if (last) /* Commit all changes or revert to the old state. */ + mount_setattr_commit(kattr, mnt, last, err); + + unlock_mount_hash(); + + if (kattr->propagation) { + namespace_unlock(); + if (err) + cleanup_group_ids(mnt, NULL); + } + + return err; +} + +static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, + struct mount_kattr *kattr, unsigned int flags) +{ + int err = 0; + struct ns_common *ns; + struct user_namespace *mnt_userns; + struct file *file; + + if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) + return 0; + + /* + * We currently do not support clearing an idmapped mount. If this ever + * is a use-case we can revisit this but for now let's keep it simple + * and not allow it. + */ + if (attr->attr_clr & MOUNT_ATTR_IDMAP) + return -EINVAL; + + if (attr->userns_fd > INT_MAX) + return -EINVAL; + + file = fget(attr->userns_fd); + if (!file) + return -EBADF; + + if (!proc_ns_file(file)) { + err = -EINVAL; + goto out_fput; + } + + ns = get_proc_ns(file_inode(file)); + if (ns->ops->type != CLONE_NEWUSER) { + err = -EINVAL; + goto out_fput; + } + + /* + * The init_user_ns is used to indicate that a vfsmount is not idmapped. + * This is simpler than just having to treat NULL as unmapped. Users + * wanting to idmap a mount to init_user_ns can just use a namespace + * with an identity mapping. + */ + mnt_userns = container_of(ns, struct user_namespace, ns); + if (mnt_userns == &init_user_ns) { + err = -EPERM; + goto out_fput; + } + kattr->mnt_userns = get_user_ns(mnt_userns); + +out_fput: + fput(file); + return err; +} + +static int build_mount_kattr(const struct mount_attr *attr, size_t usize, + struct mount_kattr *kattr, unsigned int flags) +{ + unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; + + if (flags & AT_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + *kattr = (struct mount_kattr) { + .lookup_flags = lookup_flags, + .recurse = !!(flags & AT_RECURSIVE), + }; + + if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) + return -EINVAL; + if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) + return -EINVAL; + kattr->propagation = attr->propagation; + + if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS) + return -EINVAL; + + kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set); + kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr); + + /* + * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap, + * users wanting to transition to a different atime setting cannot + * simply specify the atime setting in @attr_set, but must also + * specify MOUNT_ATTR__ATIME in the @attr_clr field. + * So ensure that MOUNT_ATTR__ATIME can't be partially set in + * @attr_clr and that @attr_set can't have any atime bits set if + * MOUNT_ATTR__ATIME isn't set in @attr_clr. + */ + if (attr->attr_clr & MOUNT_ATTR__ATIME) { + if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME) + return -EINVAL; + + /* + * Clear all previous time settings as they are mutually + * exclusive. + */ + kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME; + switch (attr->attr_set & MOUNT_ATTR__ATIME) { + case MOUNT_ATTR_RELATIME: + kattr->attr_set |= MNT_RELATIME; + break; + case MOUNT_ATTR_NOATIME: + kattr->attr_set |= MNT_NOATIME; + break; + case MOUNT_ATTR_STRICTATIME: + break; + default: + return -EINVAL; + } + } else { + if (attr->attr_set & MOUNT_ATTR__ATIME) + return -EINVAL; + } + + return build_mount_idmapped(attr, usize, kattr, flags); +} + +static void finish_mount_kattr(struct mount_kattr *kattr) +{ + put_user_ns(kattr->mnt_userns); + kattr->mnt_userns = NULL; +} + +SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, + unsigned int, flags, struct mount_attr __user *, uattr, + size_t, usize) +{ + int err; + struct path target; + struct mount_attr attr; + struct mount_kattr kattr; + + BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); + + if (flags & ~(AT_EMPTY_PATH | + AT_RECURSIVE | + AT_SYMLINK_NOFOLLOW | + AT_NO_AUTOMOUNT)) + return -EINVAL; + + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) + return -EINVAL; + + if (!may_mount()) + return -EPERM; + + err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); + if (err) + return err; + + /* Don't bother walking through the mounts if this is a nop. */ + if (attr.attr_set == 0 && + attr.attr_clr == 0 && + attr.propagation == 0) + return 0; + + err = build_mount_kattr(&attr, usize, &kattr, flags); + if (err) + return err; + + err = user_path_at(dfd, path, kattr.lookup_flags, &target); + if (err) + return err; + + err = do_mount_setattr(&target, &kattr); + finish_mount_kattr(&kattr); + path_put(&target); + return err; +} + static void __init init_mount_tree(void) { struct vfsmount *mnt; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 1a96ce28efb0..fe860c538747 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -115,13 +115,13 @@ bl_submit_bio(struct bio *bio) return NULL; } -static struct bio * -bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, +static struct bio *bl_alloc_init_bio(unsigned int npg, + struct block_device *bdev, sector_t disk_sector, bio_end_io_t end_io, struct parallel_io *par) { struct bio *bio; - npg = min(npg, BIO_MAX_PAGES); + npg = bio_max_segs(npg); bio = bio_alloc(GFP_NOIO, npg); if (bio) { bio->bi_iter.bi_sector = disk_sector; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ef827ae193d2..19a9f434442f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2095,8 +2095,8 @@ EXPORT_SYMBOL_GPL(nfs_instantiate); * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. */ -int nfs_create(struct inode *dir, struct dentry *dentry, - umode_t mode, bool excl) +int nfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct iattr attr; int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT; @@ -2124,7 +2124,8 @@ EXPORT_SYMBOL_GPL(nfs_create); * See comments for nfs_proc_create regarding failed operations. */ int -nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +nfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct iattr attr; int status; @@ -2150,7 +2151,8 @@ EXPORT_SYMBOL_GPL(nfs_mknod); /* * See comments for nfs_proc_create regarding failed operations. */ -int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +int nfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct iattr attr; int error; @@ -2295,7 +2297,8 @@ EXPORT_SYMBOL_GPL(nfs_unlink); * now have a new file handle and can instantiate an in-core NFS inode * and move the raw page into its mapping. */ -int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +int nfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct page *page; char *kaddr; @@ -2398,9 +2401,9 @@ EXPORT_SYMBOL_GPL(nfs_link); * If these conditions are met, we can drop the dentries before doing * the rename. */ -int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); @@ -2939,7 +2942,9 @@ static int nfs_execute_ok(struct inode *inode, int mask) return ret; } -int nfs_permission(struct inode *inode, int mask) +int nfs_permission(struct user_namespace *mnt_userns, + struct inode *inode, + int mask) { const struct cred *cred = current_cred(); int res = 0; @@ -2987,7 +2992,7 @@ out_notsup: res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res == 0) - res = generic_permission(inode, mask); + res = generic_permission(&init_user_ns, inode, mask); goto out; } EXPORT_SYMBOL_GPL(nfs_permission); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 63940a7a70be..16ad5050e046 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -89,7 +89,7 @@ nfs_file_release(struct inode *inode, struct file *filp) EXPORT_SYMBOL_GPL(nfs_file_release); /** - * nfs_revalidate_size - Revalidate the file size + * nfs_revalidate_file_size - Revalidate the file size * @inode: pointer to inode struct * @filp: pointer to struct file * @@ -606,8 +606,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - unsigned long written = 0; - ssize_t result; + unsigned int mntflags = NFS_SERVER(inode)->flags; + ssize_t result, written; errseq_t since; int error; @@ -626,13 +626,13 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) /* * O_APPEND implies that we must revalidate the file length. */ - if (iocb->ki_flags & IOCB_APPEND) { + if (iocb->ki_flags & IOCB_APPEND || iocb->ki_pos > i_size_read(inode)) { result = nfs_revalidate_file_size(inode, file); if (result) goto out; } - if (iocb->ki_pos > i_size_read(inode)) - nfs_revalidate_mapping(inode, file->f_mapping); + + nfs_clear_invalid_mapping(file->f_mapping); since = filemap_sample_wb_err(file->f_mapping); nfs_start_io_write(inode); @@ -648,6 +648,21 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) written = result; iocb->ki_pos += written; + + if (mntflags & NFS_MOUNT_WRITE_EAGER) { + result = filemap_fdatawrite_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); + if (result < 0) + goto out; + } + if (mntflags & NFS_MOUNT_WRITE_WAIT) { + result = filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); + if (result < 0) + goto out; + } result = generic_write_sync(iocb, written); if (result < 0) goto out; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 06894bcdea2d..971a9251c1d9 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -82,6 +82,7 @@ enum nfs_param { Opt_v, Opt_vers, Opt_wsize, + Opt_write, }; enum { @@ -113,6 +114,19 @@ static const struct constant_table nfs_param_enums_lookupcache[] = { {} }; +enum { + Opt_write_lazy, + Opt_write_eager, + Opt_write_wait, +}; + +static const struct constant_table nfs_param_enums_write[] = { + { "lazy", Opt_write_lazy }, + { "eager", Opt_write_eager }, + { "wait", Opt_write_wait }, + {} +}; + static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_flag_no("ac", Opt_ac), fsparam_u32 ("acdirmax", Opt_acdirmax), @@ -171,6 +185,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_flag ("v4.1", Opt_v), fsparam_flag ("v4.2", Opt_v), fsparam_string("vers", Opt_vers), + fsparam_enum ("write", Opt_write, nfs_param_enums_write), fsparam_u32 ("wsize", Opt_wsize), {} }; @@ -770,6 +785,24 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, goto out_invalid_value; } break; + case Opt_write: + switch (result.uint_32) { + case Opt_write_lazy: + ctx->flags &= + ~(NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT); + break; + case Opt_write_eager: + ctx->flags |= NFS_MOUNT_WRITE_EAGER; + ctx->flags &= ~NFS_MOUNT_WRITE_WAIT; + break; + case Opt_write_wait: + ctx->flags |= + NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT; + break; + default: + goto out_invalid_value; + } + break; /* * Special options @@ -1479,6 +1512,8 @@ static int nfs_init_fs_context(struct fs_context *fc) ctx->selected_flavor = RPC_AUTH_MAXFLAVOR; ctx->minorversion = 0; ctx->need_mount = true; + + fc->s_iflags |= SB_I_STABLE_WRITES; } fc->fs_private = ctx; fc->ops = &nfs_fs_context_ops; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index a60df88efc40..c4c021c6ebbd 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -390,10 +390,6 @@ static void nfs_readpage_from_fscache_complete(struct page *page, if (!error) { SetPageUptodate(page); unlock_page(page); - } else { - error = nfs_readpage_async(context, page->mapping->host, page); - if (error) - unlock_page(page); } } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 522aa10a1a3e..749bbea14d99 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -195,6 +195,18 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) } EXPORT_SYMBOL_GPL(nfs_check_cache_invalid); +#ifdef CONFIG_NFS_V4_2 +static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) +{ + return nfsi->xattr_cache != NULL; +} +#else +static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) +{ + return false; +} +#endif + static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); @@ -209,6 +221,8 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) | NFS_INO_INVALID_XATTR); } + if (!nfs_has_xattr_cache(nfsi)) + flags &= ~NFS_INO_INVALID_XATTR; if (inode->i_mapping->nrpages == 0) flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER); nfsi->cache_validity |= flags; @@ -594,7 +608,8 @@ EXPORT_SYMBOL_GPL(nfs_fhget); #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN) int -nfs_setattr(struct dentry *dentry, struct iattr *attr) +nfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; @@ -787,8 +802,8 @@ static bool nfs_need_revalidate_inode(struct inode *inode) return false; } -int nfs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct nfs_server *server = NFS_SERVER(inode); @@ -857,7 +872,7 @@ out_no_revalidate: /* Only return attributes that were revalidated. */ stat->result_mask &= request_mask; out_no_update: - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; @@ -1257,55 +1272,19 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map return 0; } -bool nfs_mapping_need_revalidate_inode(struct inode *inode) -{ - return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) || - NFS_STALE(inode); -} - -int nfs_revalidate_mapping_rcu(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - unsigned long *bitlock = &nfsi->flags; - int ret = 0; - - if (IS_SWAPFILE(inode)) - goto out; - if (nfs_mapping_need_revalidate_inode(inode)) { - ret = -ECHILD; - goto out; - } - spin_lock(&inode->i_lock); - if (test_bit(NFS_INO_INVALIDATING, bitlock) || - (nfsi->cache_validity & NFS_INO_INVALID_DATA)) - ret = -ECHILD; - spin_unlock(&inode->i_lock); -out: - return ret; -} - /** - * nfs_revalidate_mapping - Revalidate the pagecache - * @inode: pointer to host inode + * nfs_clear_invalid_mapping - Conditionally clear a mapping * @mapping: pointer to mapping + * + * If the NFS_INO_INVALID_DATA inode flag is set, clear the mapping. */ -int nfs_revalidate_mapping(struct inode *inode, - struct address_space *mapping) +int nfs_clear_invalid_mapping(struct address_space *mapping) { + struct inode *inode = mapping->host; struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; int ret = 0; - /* swapfiles are not supposed to be shared. */ - if (IS_SWAPFILE(inode)) - goto out; - - if (nfs_mapping_need_revalidate_inode(inode)) { - ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); - if (ret < 0) - goto out; - } - /* * We must clear NFS_INO_INVALID_DATA first to ensure that * invalidations that come in while we're shooting down the mappings @@ -1336,8 +1315,8 @@ int nfs_revalidate_mapping(struct inode *inode, set_bit(NFS_INO_INVALIDATING, bitlock); smp_wmb(); - nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA| - NFS_INO_DATA_INVAL_DEFER); + nfsi->cache_validity &= + ~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER); spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); ret = nfs_invalidate_mapping(inode, mapping); @@ -1350,6 +1329,53 @@ out: return ret; } +bool nfs_mapping_need_revalidate_inode(struct inode *inode) +{ + return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) || + NFS_STALE(inode); +} + +int nfs_revalidate_mapping_rcu(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long *bitlock = &nfsi->flags; + int ret = 0; + + if (IS_SWAPFILE(inode)) + goto out; + if (nfs_mapping_need_revalidate_inode(inode)) { + ret = -ECHILD; + goto out; + } + spin_lock(&inode->i_lock); + if (test_bit(NFS_INO_INVALIDATING, bitlock) || + (nfsi->cache_validity & NFS_INO_INVALID_DATA)) + ret = -ECHILD; + spin_unlock(&inode->i_lock); +out: + return ret; +} + +/** + * nfs_revalidate_mapping - Revalidate the pagecache + * @inode: pointer to host inode + * @mapping: pointer to mapping + */ +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + /* swapfiles are not supposed to be shared. */ + if (IS_SWAPFILE(inode)) + return 0; + + if (nfs_mapping_need_revalidate_inode(inode)) { + int ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret < 0) + return ret; + } + + return nfs_clear_invalid_mapping(mapping); +} + static bool nfs_file_has_writers(struct nfs_inode *nfsi) { struct inode *inode = &nfsi->vfs_inode; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 62d3189745cd..25fb43b69e5a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -378,14 +378,18 @@ extern unsigned long nfs_access_cache_count(struct shrinker *shrink, extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); -int nfs_create(struct inode *, struct dentry *, umode_t, bool); -int nfs_mkdir(struct inode *, struct dentry *, umode_t); +int nfs_create(struct user_namespace *, struct inode *, struct dentry *, + umode_t, bool); +int nfs_mkdir(struct user_namespace *, struct inode *, struct dentry *, + umode_t); int nfs_rmdir(struct inode *, struct dentry *); int nfs_unlink(struct inode *, struct dentry *); -int nfs_symlink(struct inode *, struct dentry *, const char *); +int nfs_symlink(struct user_namespace *, struct inode *, struct dentry *, + const char *); int nfs_link(struct dentry *, struct inode *, struct dentry *); -int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); -int nfs_rename(struct inode *, struct dentry *, +int nfs_mknod(struct user_namespace *, struct inode *, struct dentry *, umode_t, + dev_t); +int nfs_rename(struct user_namespace *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); /* file.c */ diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 2bcbe38afe2e..93e60e921f92 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -208,20 +208,23 @@ out_fc: } static int -nfs_namespace_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +nfs_namespace_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { if (NFS_FH(d_inode(path->dentry))->size != 0) - return nfs_getattr(path, stat, request_mask, query_flags); - generic_fillattr(d_inode(path->dentry), stat); + return nfs_getattr(mnt_userns, path, stat, request_mask, + query_flags); + generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); return 0; } static int -nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) +nfs_namespace_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { if (NFS_FH(d_inode(dentry))->size != 0) - return nfs_setattr(dentry, attr); + return nfs_setattr(mnt_userns, dentry, attr); return -EACCES; } diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index 1b950b66b3bb..c8a192802dda 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -12,7 +12,8 @@ */ #ifdef CONFIG_NFS_V3_ACL extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type); -extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl); extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index c6c863382f37..bb386a691e69 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -111,6 +111,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) fallthrough; case -ENOTSUPP: status = -EOPNOTSUPP; + goto getout; default: goto getout; } @@ -251,7 +252,8 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, } -int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { struct posix_acl *orig = acl, *dfacl = NULL, *alloc; int status; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 86acffe7335c..889a9f4c0310 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -609,6 +609,7 @@ found: * changed. Schedule recovery! */ nfs4_schedule_path_down_recovery(pos); + goto out; default: goto out; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2f4679a62712..74bc5120013d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -71,10 +71,6 @@ #include "nfs4trace.h" -#ifdef CONFIG_NFS_V4_2 -#include "nfs42.h" -#endif /* CONFIG_NFS_V4_2 */ - #define NFSDBG_FACILITY NFSDBG_PROC #define NFS4_BITMASK_SZ 3 @@ -2231,6 +2227,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct default: printk(KERN_ERR "NFS: %s: unhandled error " "%d.\n", __func__, err); + fallthrough; case 0: case -ENOENT: case -EAGAIN: @@ -5438,15 +5435,16 @@ static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode, if (cache_validity & NFS_INO_INVALID_ATIME) bitmask[1] |= FATTR4_WORD1_TIME_ACCESS; - if (cache_validity & NFS_INO_INVALID_ACCESS) - bitmask[0] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | - FATTR4_WORD1_OWNER_GROUP; - if (cache_validity & NFS_INO_INVALID_ACL) - bitmask[0] |= FATTR4_WORD0_ACL; - if (cache_validity & NFS_INO_INVALID_LABEL) + if (cache_validity & NFS_INO_INVALID_OTHER) + bitmask[1] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | + FATTR4_WORD1_OWNER_GROUP | + FATTR4_WORD1_NUMLINKS; + if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL) bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL; - if (cache_validity & NFS_INO_INVALID_CTIME) + if (cache_validity & NFS_INO_INVALID_CHANGE) bitmask[0] |= FATTR4_WORD0_CHANGE; + if (cache_validity & NFS_INO_INVALID_CTIME) + bitmask[1] |= FATTR4_WORD1_TIME_METADATA; if (cache_validity & NFS_INO_INVALID_MTIME) bitmask[1] |= FATTR4_WORD1_TIME_MODIFY; if (cache_validity & NFS_INO_INVALID_SIZE) @@ -7491,6 +7489,7 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7513,6 +7512,7 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7563,6 +7563,7 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) #ifdef CONFIG_NFS_V4_2 static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -9705,6 +9706,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) case -NFS4ERR_BADLAYOUT: /* no layout */ case -NFS4ERR_GRACE: /* loca_recalim always false */ task->tk_status = 0; + break; case 0: break; default: diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 4bf10792cb5b..3a51351bdc6a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1125,6 +1125,7 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) " sequence-id error on an" " unconfirmed sequence %p!\n", seqid->sequence); + return; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_BAD_STATEID: diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index af64b4e6fd1f..102b66e0bdef 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2875,6 +2875,7 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc, switch (trypnfs) { case PNFS_NOT_ATTEMPTED: pnfs_write_through_mds(desc, hdr); + break; case PNFS_ATTEMPTED: break; case PNFS_TRY_AGAIN: @@ -3019,6 +3020,7 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) switch (trypnfs) { case PNFS_NOT_ATTEMPTED: pnfs_read_through_mds(desc, hdr); + break; case PNFS_ATTEMPTED: break; case PNFS_TRY_AGAIN: diff --git a/fs/nfs/read.c b/fs/nfs/read.c index eb854f1f86e2..d2b6dce1f99f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -74,6 +74,24 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); +static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ + struct nfs_pgio_mirror *pgm; + unsigned long npages; + + nfs_pageio_complete(pgio); + + /* It doesn't make sense to do mirrored reads! */ + WARN_ON_ONCE(pgio->pg_mirror_count != 1); + + pgm = &pgio->pg_mirrors[0]; + NFS_I(inode)->read_io += pgm->pg_bytes_written; + npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); +} + + void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) { struct nfs_pgio_mirror *mirror; @@ -114,41 +132,10 @@ static void nfs_readpage_release(struct nfs_page *req, int error) nfs_release_request(req); } -int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, - struct page *page) -{ - struct nfs_page *new; - unsigned int len; +struct nfs_readdesc { struct nfs_pageio_descriptor pgio; - struct nfs_pgio_mirror *pgm; - - len = nfs_page_length(page); - if (len == 0) - return nfs_return_empty_page(page); - new = nfs_create_request(ctx, page, 0, len); - if (IS_ERR(new)) { - unlock_page(page); - return PTR_ERR(new); - } - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); - - nfs_pageio_init_read(&pgio, inode, false, - &nfs_async_read_completion_ops); - if (!nfs_pageio_add_request(&pgio, new)) { - nfs_list_remove_request(new); - nfs_readpage_release(new, pgio.pg_error); - } - nfs_pageio_complete(&pgio); - - /* It doesn't make sense to do mirrored reads! */ - WARN_ON_ONCE(pgio.pg_mirror_count != 1); - - pgm = &pgio.pg_mirrors[0]; - NFS_I(inode)->read_io += pgm->pg_bytes_written; - - return pgio.pg_error < 0 ? pgio.pg_error : 0; -} + struct nfs_open_context *ctx; +}; static void nfs_page_group_set_uptodate(struct nfs_page *req) { @@ -171,8 +158,7 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { /* note: regions of the page not covered by a - * request are zeroed in nfs_readpage_async / - * readpage_async_filler */ + * request are zeroed in readpage_async_filler */ if (bytes > hdr->good_bytes) { /* nothing in this request was good, so zero * the full extent of the request */ @@ -304,6 +290,38 @@ static void nfs_readpage_result(struct rpc_task *task, nfs_readpage_retry(task, hdr); } +static int +readpage_async_filler(void *data, struct page *page) +{ + struct nfs_readdesc *desc = data; + struct nfs_page *new; + unsigned int len; + int error; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); + + new = nfs_create_request(desc->ctx, page, 0, len); + if (IS_ERR(new)) + goto out_error; + + if (len < PAGE_SIZE) + zero_user_segment(page, len, PAGE_SIZE); + if (!nfs_pageio_add_request(&desc->pgio, new)) { + nfs_list_remove_request(new); + error = desc->pgio.pg_error; + nfs_readpage_release(new, error); + goto out; + } + return 0; +out_error: + error = PTR_ERR(new); + unlock_page(page); +out: + return error; +} + /* * Read a page over NFS. * We read the page synchronously in the following case: @@ -312,14 +330,13 @@ static void nfs_readpage_result(struct rpc_task *task, */ int nfs_readpage(struct file *file, struct page *page) { - struct nfs_open_context *ctx; + struct nfs_readdesc desc; struct inode *inode = page_file_mapping(page)->host; - int error; + int ret; dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", page, PAGE_SIZE, page_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); - nfs_add_stats(inode, NFSIOS_READPAGES, 1); /* * Try to flush any pending writes to the file.. @@ -328,93 +345,59 @@ int nfs_readpage(struct file *file, struct page *page) * be any new pending writes generated at this point * for this page (other pages can be written to). */ - error = nfs_wb_page(inode, page); - if (error) + ret = nfs_wb_page(inode, page); + if (ret) goto out_unlock; if (PageUptodate(page)) goto out_unlock; - error = -ESTALE; + ret = -ESTALE; if (NFS_STALE(inode)) goto out_unlock; if (file == NULL) { - error = -EBADF; - ctx = nfs_find_open_context(inode, NULL, FMODE_READ); - if (ctx == NULL) + ret = -EBADF; + desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); + if (desc.ctx == NULL) goto out_unlock; } else - ctx = get_nfs_open_context(nfs_file_open_context(file)); + desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); if (!IS_SYNC(inode)) { - error = nfs_readpage_from_fscache(ctx, inode, page); - if (error == 0) + ret = nfs_readpage_from_fscache(desc.ctx, inode, page); + if (ret == 0) goto out; } - xchg(&ctx->error, 0); - error = nfs_readpage_async(ctx, inode, page); - if (!error) { - error = wait_on_page_locked_killable(page); - if (!PageUptodate(page) && !error) - error = xchg(&ctx->error, 0); - } -out: - put_nfs_open_context(ctx); - return error; -out_unlock: - unlock_page(page); - return error; -} - -struct nfs_readdesc { - struct nfs_pageio_descriptor *pgio; - struct nfs_open_context *ctx; -}; - -static int -readpage_async_filler(void *data, struct page *page) -{ - struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct nfs_page *new; - unsigned int len; - int error; + xchg(&desc.ctx->error, 0); + nfs_pageio_init_read(&desc.pgio, inode, false, + &nfs_async_read_completion_ops); - len = nfs_page_length(page); - if (len == 0) - return nfs_return_empty_page(page); + ret = readpage_async_filler(&desc, page); - new = nfs_create_request(desc->ctx, page, 0, len); - if (IS_ERR(new)) - goto out_error; + if (!ret) + nfs_pageio_complete_read(&desc.pgio, inode); - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); - if (!nfs_pageio_add_request(desc->pgio, new)) { - nfs_list_remove_request(new); - error = desc->pgio->pg_error; - nfs_readpage_release(new, error); - goto out; + ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0; + if (!ret) { + ret = wait_on_page_locked_killable(page); + if (!PageUptodate(page) && !ret) + ret = xchg(&desc.ctx->error, 0); } - return 0; -out_error: - error = PTR_ERR(new); - unlock_page(page); out: - return error; + put_nfs_open_context(desc.ctx); + return ret; +out_unlock: + unlock_page(page); + return ret; } -int nfs_readpages(struct file *filp, struct address_space *mapping, +int nfs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct nfs_pageio_descriptor pgio; - struct nfs_pgio_mirror *pgm; - struct nfs_readdesc desc = { - .pgio = &pgio, - }; + struct nfs_readdesc desc; struct inode *inode = mapping->host; - unsigned long npages; - int ret = -ESTALE; + int ret; dprintk("NFS: nfs_readpages (%s/%Lu %d)\n", inode->i_sb->s_id, @@ -422,15 +405,17 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, nr_pages); nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); + ret = -ESTALE; if (NFS_STALE(inode)) goto out; - if (filp == NULL) { + if (file == NULL) { + ret = -EBADF; desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); if (desc.ctx == NULL) - return -EBADF; + goto out; } else - desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); + desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); /* attempt to read as many of the pages as possible from the cache * - this returns -ENOBUFS immediately if the cookie is negative @@ -440,20 +425,13 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - nfs_pageio_init_read(&pgio, inode, false, + nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); - nfs_pageio_complete(&pgio); - /* It doesn't make sense to do mirrored reads! */ - WARN_ON_ONCE(pgio.pg_mirror_count != 1); + nfs_pageio_complete_read(&desc.pgio, inode); - pgm = &pgio.pg_mirrors[0]; - NFS_I(inode)->read_io += pgm->pg_bytes_written; - npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> - PAGE_SHIFT; - nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: put_nfs_open_context(desc.ctx); out: diff --git a/fs/nfs/super.c b/fs/nfs/super.c index c7a924580eec..94885c6f8f54 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -523,6 +523,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, seq_puts(m, ",local_lock=flock"); else seq_puts(m, ",local_lock=posix"); + + if (nfss->flags & NFS_MOUNT_WRITE_EAGER) { + if (nfss->flags & NFS_MOUNT_WRITE_WAIT) + seq_puts(m, ",write=wait"); + else + seq_puts(m, ",write=eager"); + } } /* diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 639c34fec04a..82bdcb982186 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -712,16 +712,23 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct nfs_pageio_descriptor pgio; - struct nfs_io_completion *ioc; + struct nfs_io_completion *ioc = NULL; + unsigned int mntflags = NFS_SERVER(inode)->flags; + int priority = 0; int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); - ioc = nfs_io_completion_alloc(GFP_KERNEL); - if (ioc) - nfs_io_completion_init(ioc, nfs_io_completion_commit, inode); + if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate || + wbc->for_background || wbc->for_sync || wbc->for_reclaim) { + ioc = nfs_io_completion_alloc(GFP_KERNEL); + if (ioc) + nfs_io_completion_init(ioc, nfs_io_completion_commit, + inode); + priority = wb_priority(wbc); + } - nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, + nfs_pageio_init_write(&pgio, inode, priority, false, &nfs_async_write_completion_ops); pgio.pg_io_completion = ioc; err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); @@ -1278,19 +1285,21 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode) * the PageUptodate() flag. In this case, we will need to turn off * write optimisations that depend on the page contents being correct. */ -static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) +static bool nfs_write_pageuptodate(struct page *page, struct inode *inode, + unsigned int pagelen) { struct nfs_inode *nfsi = NFS_I(inode); if (nfs_have_delegated_attributes(inode)) goto out; - if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + if (nfsi->cache_validity & + (NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_SIZE)) return false; smp_rmb(); - if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) + if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags) && pagelen != 0) return false; out: - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + if (nfsi->cache_validity & NFS_INO_INVALID_DATA && pagelen != 0) return false; return PageUptodate(page) != 0; } @@ -1310,7 +1319,8 @@ is_whole_file_wrlock(struct file_lock *fl) * If the file is opened for synchronous writes then we can just skip the rest * of the checks. */ -static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) +static int nfs_can_extend_write(struct file *file, struct page *page, + struct inode *inode, unsigned int pagelen) { int ret; struct file_lock_context *flctx = inode->i_flctx; @@ -1318,7 +1328,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino if (file->f_flags & O_DSYNC) return 0; - if (!nfs_write_pageuptodate(page, inode)) + if (!nfs_write_pageuptodate(page, inode, pagelen)) return 0; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) return 1; @@ -1356,6 +1366,7 @@ int nfs_updatepage(struct file *file, struct page *page, struct nfs_open_context *ctx = nfs_file_open_context(file); struct address_space *mapping = page_file_mapping(page); struct inode *inode = mapping->host; + unsigned int pagelen = nfs_page_length(page); int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); @@ -1366,8 +1377,8 @@ int nfs_updatepage(struct file *file, struct page *page, if (!count) goto out; - if (nfs_can_extend_write(file, page, inode)) { - count = max(count + offset, nfs_page_length(page)); + if (nfs_can_extend_write(file, page, inode, pagelen)) { + count = max(count + offset, pagelen); offset = 0; } diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 7c863f2c21e0..9421dae22737 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -386,8 +386,9 @@ static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); -static int check_export(struct inode *inode, int *flags, unsigned char *uuid) +static int check_export(struct path *path, int *flags, unsigned char *uuid) { + struct inode *inode = d_inode(path->dentry); /* * We currently export only dirs, regular files, and (for v4 @@ -411,6 +412,7 @@ static int check_export(struct inode *inode, int *flags, unsigned char *uuid) * or an FSID number (so NFSEXP_FSID or ->uuid is needed). * 2: We must be able to find an inode from a filehandle. * This means that s_export_op must be set. + * 3: We must not currently be on an idmapped mount. */ if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && !(*flags & NFSEXP_FSID) && @@ -425,6 +427,11 @@ static int check_export(struct inode *inode, int *flags, unsigned char *uuid) return -EINVAL; } + if (mnt_user_ns(path->mnt) != &init_user_ns) { + dprintk("exp_export: export of idmapped mounts not yet supported.\n"); + return -EINVAL; + } + if (inode->i_sb->s_export_op->flags & EXPORT_OP_NOSUBTREECHK && !(*flags & NFSEXP_NOSUBTREECHECK)) { dprintk("%s: %s does not support subtree checking!\n", @@ -653,8 +660,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) goto out4; } - err = check_export(d_inode(exp.ex_path.dentry), &exp.ex_flags, - exp.ex_uuid); + err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid); if (err) goto out4; /* diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 7eeac5b81c20..855e17772eba 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -113,10 +113,12 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp) fh_lock(fh); - error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access); + error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, + argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default); + error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT, + argp->acl_default); if (error) goto out_drop_lock; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index a568b842e9eb..9a6f18d74d14 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -103,10 +103,12 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) fh_lock(fh); - error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access); + error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, + argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default); + error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT, + argp->acl_default); out_drop_lock: fh_unlock(fh); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 71292a0d6f09..eaa3a0cf38f1 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -781,12 +781,13 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, fh_lock(fhp); - host_error = set_posix_acl(inode, ACL_TYPE_ACCESS, pacl); + host_error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, pacl); if (host_error < 0) goto out_drop_lock; if (S_ISDIR(inode->i_mode)) { - host_error = set_posix_acl(inode, ACL_TYPE_DEFAULT, dpacl); + host_error = set_posix_acl(&init_user_ns, inode, + ACL_TYPE_DEFAULT, dpacl); } out_drop_lock: diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 186fa2c2c6ba..891395c6c7d3 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -233,7 +233,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. */ goto out_put; - status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU); + status = vfs_mkdir(&init_user_ns, d_inode(dir), dentry, S_IRWXU); out_put: dput(dentry); out_unlock: @@ -353,7 +353,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) status = -ENOENT; if (d_really_is_negative(dentry)) goto out; - status = vfs_rmdir(d_inode(dir), dentry); + status = vfs_rmdir(&init_user_ns, d_inode(dir), dentry); out: dput(dentry); out_unlock: @@ -443,7 +443,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) if (nfs4_has_reclaimed_state(name, nn)) goto out_free; - status = vfs_rmdir(d_inode(parent), child); + status = vfs_rmdir(&init_user_ns, d_inode(parent), child); if (status) printk("failed to remove client recovery directory %pd\n", child); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 4744a276058d..10b44421eace 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -40,7 +40,8 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) /* make sure parents give x permission to user */ int err; parent = dget_parent(tdentry); - err = inode_permission(d_inode(parent), MAY_EXEC); + err = inode_permission(&init_user_ns, + d_inode(parent), MAY_EXEC); if (err < 0) { dput(parent); break; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index b2f8035f166b..a8d5449dd0e9 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -90,7 +90,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp) if (delta < 0) delta = -delta; if (delta < MAX_TOUCH_TIME_ERROR && - setattr_prepare(fhp->fh_dentry, iap) != 0) { + setattr_prepare(&init_user_ns, fhp->fh_dentry, iap) != 0) { /* * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. * This will cause notify_change to set these times diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d316e11923c5..fd6be35a1642 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -448,7 +448,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, .ia_size = iap->ia_size, }; - host_err = notify_change(dentry, &size_attr, NULL); + host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL); if (host_err) goto out_unlock; iap->ia_valid &= ~ATTR_SIZE; @@ -463,7 +463,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, } iap->ia_valid |= ATTR_CTIME; - host_err = notify_change(dentry, iap, NULL); + host_err = notify_change(&init_user_ns, dentry, iap, NULL); out_unlock: fh_unlock(fhp); @@ -499,7 +499,8 @@ int nfsd4_is_junction(struct dentry *dentry) return 0; if (!(inode->i_mode & S_ISVTX)) return 0; - if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0) + if (vfs_getxattr(&init_user_ns, dentry, NFSD_JUNCTION_XATTR_NAME, + NULL, 0) <= 0) return 0; return 1; } @@ -1254,12 +1255,12 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, host_err = 0; switch (type) { case S_IFREG: - host_err = vfs_create(dirp, dchild, iap->ia_mode, true); + host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true); if (!host_err) nfsd_check_ignore_resizing(iap); break; case S_IFDIR: - host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); + host_err = vfs_mkdir(&init_user_ns, dirp, dchild, iap->ia_mode); if (!host_err && unlikely(d_unhashed(dchild))) { struct dentry *d; d = lookup_one_len(dchild->d_name.name, @@ -1287,7 +1288,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, case S_IFBLK: case S_IFIFO: case S_IFSOCK: - host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); + host_err = vfs_mknod(&init_user_ns, dirp, dchild, + iap->ia_mode, rdev); break; default: printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", @@ -1485,7 +1487,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!IS_POSIXACL(dirp)) iap->ia_mode &= ~current_umask(); - host_err = vfs_create(dirp, dchild, iap->ia_mode, true); + host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true); if (host_err < 0) { fh_drop_write(fhp); goto out_nfserr; @@ -1609,7 +1611,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dnew)) goto out_nfserr; - host_err = vfs_symlink(d_inode(dentry), dnew, path); + host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path); err = nfserrno(host_err); if (!err) err = nfserrno(commit_metadata(fhp)); @@ -1677,7 +1679,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserr_noent; if (d_really_is_negative(dold)) goto out_dput; - host_err = vfs_link(dold, dirp, dnew, NULL); + host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); if (!err) @@ -1797,7 +1799,15 @@ retry: close_cached = true; goto out_dput_old; } else { - host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); + struct renamedata rd = { + .old_mnt_userns = &init_user_ns, + .old_dir = fdir, + .old_dentry = odentry, + .new_mnt_userns = &init_user_ns, + .new_dir = tdir, + .new_dentry = ndentry, + }; + host_err = vfs_rename(&rd); if (!host_err) { host_err = commit_metadata(tfhp); if (!host_err) @@ -1884,9 +1894,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (type != S_IFDIR) { if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) nfsd_close_cached_files(rdentry); - host_err = vfs_unlink(dirp, rdentry, NULL); + host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL); } else { - host_err = vfs_rmdir(dirp, rdentry); + host_err = vfs_rmdir(&init_user_ns, dirp, rdentry); } if (!host_err) @@ -2149,7 +2159,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, inode_lock_shared(inode); - len = vfs_getxattr(dentry, name, NULL, 0); + len = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0); /* * Zero-length attribute, just return. @@ -2176,7 +2186,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, goto out; } - len = vfs_getxattr(dentry, name, buf, len); + len = vfs_getxattr(&init_user_ns, dentry, name, buf, len); if (len <= 0) { kvfree(buf); buf = NULL; @@ -2283,7 +2293,8 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name) fh_lock(fhp); - ret = __vfs_removexattr_locked(fhp->fh_dentry, name, NULL); + ret = __vfs_removexattr_locked(&init_user_ns, fhp->fh_dentry, + name, NULL); fh_unlock(fhp); fh_drop_write(fhp); @@ -2307,8 +2318,8 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, return nfserrno(ret); fh_lock(fhp); - ret = __vfs_setxattr_locked(fhp->fh_dentry, name, buf, len, flags, - NULL); + ret = __vfs_setxattr_locked(&init_user_ns, fhp->fh_dentry, name, buf, + len, flags, NULL); fh_unlock(fhp); fh_drop_write(fhp); @@ -2391,13 +2402,14 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, return 0; /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ - err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC)); + err = inode_permission(&init_user_ns, inode, + acc & (MAY_READ | MAY_WRITE | MAY_EXEC)); /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) - err = inode_permission(inode, MAY_EXEC); + err = inode_permission(&init_user_ns, inode, MAY_EXEC); return err? nfserrno(err) : 0; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 745d371d6fea..2e8eb263cf0f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -348,7 +348,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) /* reference count of i_bh inherits from nilfs_mdt_read_block() */ atomic64_inc(&root->inodes_count); - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_ino = ino; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); @@ -805,14 +805,15 @@ void nilfs_evict_inode(struct inode *inode) */ } -int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) +int nilfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct nilfs_transaction_info ti; struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; int err; - err = setattr_prepare(dentry, iattr); + err = setattr_prepare(&init_user_ns, dentry, iattr); if (err) return err; @@ -827,7 +828,7 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) nilfs_truncate(inode); } - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { @@ -843,7 +844,8 @@ out_err: return err; } -int nilfs_permission(struct inode *inode, int mask) +int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { struct nilfs_root *root = NILFS_I(inode)->i_root; @@ -851,7 +853,7 @@ int nilfs_permission(struct inode *inode, int mask) root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 07d26f61f22a..b053b40315bf 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -132,7 +132,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, unsigned int flags, oldflags; int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (get_user(flags, (int __user *)argp)) diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index a6ec7961d4f5..ecace5f96a95 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -72,8 +72,8 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int nilfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; struct nilfs_transaction_info ti; @@ -100,7 +100,8 @@ static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } static int -nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +nilfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; struct nilfs_transaction_info ti; @@ -124,8 +125,8 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) return err; } -static int nilfs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int nilfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct nilfs_transaction_info ti; struct super_block *sb = dir->i_sb; @@ -201,7 +202,8 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, return err; } -static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int nilfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct nilfs_transaction_info ti; @@ -338,8 +340,9 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) return err; } -static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, +static int nilfs_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index f8450ee3fd06..c4a45a081ade 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -267,9 +267,11 @@ extern struct inode *nilfs_iget_for_gc(struct super_block *sb, extern void nilfs_update_inode(struct inode *, struct buffer_head *, int); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); -extern int nilfs_setattr(struct dentry *, struct iattr *); +extern int nilfs_setattr(struct user_namespace *, struct dentry *, + struct iattr *); extern void nilfs_write_failed(struct address_space *mapping, loff_t to); -int nilfs_permission(struct inode *inode, int mask); +int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask); int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index b78dd1f88f7c..9e0c1afac8bd 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -702,7 +702,7 @@ static int fanotify_find_path(int dfd, const char __user *filename, } /* you can only watch an inode if you have read permissions on it */ - ret = inode_permission(path->dentry->d_inode, MAY_READ); + ret = path_permission(path, MAY_READ); if (ret) { path_put(path); goto out; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 266d17e8ecb9..c71be4fb7dc5 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -352,7 +352,7 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, if (error) return error; /* you can only watch an inode if you have read permissions on it */ - error = inode_permission(path->dentry->d_inode, MAY_READ); + error = path_permission(path, MAY_READ); if (error) { path_put(path); return error; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index f7e4cbc26eaf..f5c058b3192c 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -629,6 +629,12 @@ static int ntfs_read_locked_inode(struct inode *vi) } a = ctx->attr; /* Get the standard information attribute value. */ + if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset) + + le32_to_cpu(a->data.resident.value_length) > + (u8 *)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode."); + goto unm_err_out; + } si = (STANDARD_INFORMATION*)((u8*)a + le16_to_cpu(a->data.resident.value_offset)); @@ -2848,6 +2854,7 @@ void ntfs_truncate_vfs(struct inode *vi) { /** * ntfs_setattr - called from notify_change() when an attribute is being changed + * @mnt_userns: user namespace of the mount the inode was found from * @dentry: dentry whose attributes to change * @attr: structure describing the attributes and the changes * @@ -2860,13 +2867,14 @@ void ntfs_truncate_vfs(struct inode *vi) { * * Called with ->i_mutex held. */ -int ntfs_setattr(struct dentry *dentry, struct iattr *attr) +int ntfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *vi = d_inode(dentry); int err; unsigned int ia_valid = attr->ia_valid; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) goto out; /* We do not support NTFS ACLs yet. */ diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 363e4e820673..6f78ee00f57f 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -289,7 +289,8 @@ extern int ntfs_show_options(struct seq_file *sf, struct dentry *root); extern int ntfs_truncate(struct inode *vi); extern void ntfs_truncate_vfs(struct inode *vi); -extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr); +extern int ntfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr); extern int __ntfs_write_inode(struct inode *vi, int sync); diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h index 85422761ff43..5d4bf7a3259f 100644 --- a/fs/ntfs/layout.h +++ b/fs/ntfs/layout.h @@ -703,7 +703,7 @@ typedef struct { /* 14*/ le16 instance; /* The instance of this attribute record. This number is unique within this mft record (see MFT_RECORD/next_attribute_instance notes in - in mft.h for more details). */ + mft.h for more details). */ /* 16*/ union { /* Resident attributes. */ struct { @@ -1838,7 +1838,7 @@ typedef struct { * Also, each security descriptor is stored twice in the $SDS stream with a * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size) * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the - * the first copy of the security descriptor will be at offset 0x51d0 in the + * first copy of the security descriptor will be at offset 0x51d0 in the * $SDS data stream and the second copy will be at offset 0x451d0. */ typedef struct { diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 7b07f5df3a29..5259badabb56 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -262,7 +262,8 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { struct buffer_head *bh = NULL; int status, had_lock; @@ -274,7 +275,8 @@ int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (type == ACL_TYPE_ACCESS && acl) { umode_t mode; - status = posix_acl_update_mode(inode, &mode, &acl); + status = posix_acl_update_mode(&init_user_ns, inode, &mode, + &acl); if (status) goto unlock; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 127b13432146..4e86450917b2 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -19,7 +19,8 @@ struct ocfs2_acl_entry { }; struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); -int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct buffer_head *, struct buffer_head *, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 0179a73a3fa2..12a7590601dd 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2042,7 +2042,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g o2hb_nego_timeout_handler, reg, NULL, ®->hr_handler_list); if (ret) - goto free; + goto remove_item; ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key, sizeof(struct o2hb_nego_msg), @@ -2057,6 +2057,12 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g unregister_handler: o2net_unregister_handler_list(®->hr_handler_list); +remove_item: + spin_lock(&o2hb_live_lock); + list_del(®->hr_all_item); + if (o2hb_global_heartbeat_active()) + clear_bit(reg->hr_region_num, o2hb_region_bitmap); + spin_unlock(&o2hb_live_lock); free: kfree(reg); return ERR_PTR(ret); diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 6abaded3ff6b..70a10764f249 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -165,16 +165,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) spin_unlock(&lock->spinlock); } -void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) -{ - BUG_ON(!dlm); - BUG_ON(!lock); - - spin_lock(&dlm->ast_lock); - __dlm_queue_bast(dlm, lock); - spin_unlock(&dlm->ast_lock); -} - static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lock *lock) { diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index c8a444622faa..58d57e25d384 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -17,10 +17,7 @@ #define DLM_LOCKID_NAME_MAX 32 -#define DLM_DOMAIN_NAME_MAX_LEN 255 #define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES -#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes -#define DLM_THREAD_MS 200 // flush at least every 200 ms #define DLM_HASH_SIZE_DEFAULT (1 << 17) #if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE @@ -902,7 +899,6 @@ void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); -void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_do_local_ast(struct dlm_ctxt *dlm, diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 583820ec63e2..b2870f1a31df 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -190,17 +190,18 @@ static int dlmfs_file_release(struct inode *inode, * We do ->setattr() just to override size changes. Our size is the size * of the LVB and nothing else. */ -static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr) +static int dlmfs_file_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = d_inode(dentry); attr->ia_valid &= ~ATTR_SIZE; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } @@ -329,7 +330,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); - inode_init_owner(inode, NULL, mode); + inode_init_owner(&init_user_ns, inode, NULL, mode); inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inc_nlink(inode); @@ -352,7 +353,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, return NULL; inode->i_ino = get_next_ino(); - inode_init_owner(inode, parent, mode); + inode_init_owner(&init_user_ns, inode, parent, mode); inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); ip = DLMFS_I(inode); @@ -395,7 +396,8 @@ static struct inode *dlmfs_get_inode(struct inode *parent, * File creation. Allocate an inode, and we're done.. */ /* SMP-safe */ -static int dlmfs_mkdir(struct inode * dir, +static int dlmfs_mkdir(struct user_namespace * mnt_userns, + struct inode * dir, struct dentry * dentry, umode_t mode) { @@ -443,7 +445,8 @@ bail: return status; } -static int dlmfs_create(struct inode *dir, +static int dlmfs_create(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index df6d709d2ae3..6611c64ca0be 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1112,7 +1112,8 @@ out: return ret; } -int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) +int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { int status = 0, size_change; int inode_locked = 0; @@ -1142,7 +1143,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) return 0; - status = setattr_prepare(dentry, attr); + status = setattr_prepare(&init_user_ns, dentry, attr); if (status) return status; @@ -1263,7 +1264,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); status = ocfs2_mark_inode_dirty(handle, inode, bh); @@ -1298,8 +1299,8 @@ bail: return status; } -int ocfs2_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); struct super_block *sb = path->dentry->d_sb; @@ -1313,7 +1314,7 @@ int ocfs2_getattr(const struct path *path, struct kstat *stat, goto bail; } - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). @@ -1330,7 +1331,8 @@ bail: return err; } -int ocfs2_permission(struct inode *inode, int mask) +int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { int ret, had_lock; struct ocfs2_lock_holder oh; @@ -1355,7 +1357,7 @@ int ocfs2_permission(struct inode *inode, int mask) dump_stack(); } - ret = generic_permission(inode, mask); + ret = generic_permission(&init_user_ns, inode, mask); ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); out: diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 4832cbceba5b..8536cec5f122 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -51,10 +51,13 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); -int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); -int ocfs2_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); -int ocfs2_permission(struct inode *inode, int mask); +int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); +int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +int ocfs2_permission(struct user_namespace *mnt_userns, + struct inode *inode, + int mask); int ocfs2_should_update_atime(struct inode *inode, struct vfsmount *vfsmnt); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 89984172fc4a..50c9b30ee9f6 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -96,7 +96,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, } status = -EACCES; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) goto bail_unlock; if (!S_ISDIR(inode->i_mode)) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2a237ab00453..3abdd36da2e2 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -198,7 +198,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) * callers. */ if (S_ISDIR(mode)) set_nlink(inode, 2); - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); status = dquot_initialize(inode); if (status) return ERR_PTR(status); @@ -221,7 +221,8 @@ static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, iput(inode); } -static int ocfs2_mknod(struct inode *dir, +static int ocfs2_mknod(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) @@ -645,7 +646,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, return status; } -static int ocfs2_mkdir(struct inode *dir, +static int ocfs2_mkdir(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -653,14 +655,15 @@ static int ocfs2_mkdir(struct inode *dir, trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name, OCFS2_I(dir)->ip_blkno, mode); - ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); + ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0); if (ret) mlog_errno(ret); return ret; } -static int ocfs2_create(struct inode *dir, +static int ocfs2_create(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) @@ -669,7 +672,7 @@ static int ocfs2_create(struct inode *dir, trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, mode); - ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0); + ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); if (ret) mlog_errno(ret); @@ -1195,7 +1198,8 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) ocfs2_inode_unlock(inode2, 1); } -static int ocfs2_rename(struct inode *old_dir, +static int ocfs2_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, @@ -1784,7 +1788,8 @@ bail: return status; } -static int ocfs2_symlink(struct inode *dir, +static int ocfs2_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, const char *symname) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 3b397fa9c9e8..c19a463fac55 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -978,7 +978,7 @@ static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, return 0; } - if (!eb || (eb && !eb->h_next_leaf_blk)) { + if (!eb || !eb->h_next_leaf_blk) { /* * We are the last extent rec, so any high cpos should * be stored in this leaf refcount block. @@ -4346,7 +4346,7 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return inode_permission(dir, MAY_WRITE | MAY_EXEC); + return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); } /** @@ -4400,7 +4400,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, * file. */ if (!preserve) { - error = inode_permission(inode, MAY_READ); + error = inode_permission(&init_user_ns, inode, MAY_READ); if (error) return error; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2febc76e9de7..079f8826993e 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -973,8 +973,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) * quota files */ dquot_disable(sb, type, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - if (!inode) - continue; iput(inode); } } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 9ccd19d8f7b1..36ae47a4aef6 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7249,6 +7249,7 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler, } static int ocfs2_xattr_security_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -7321,6 +7322,7 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, } static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -7351,6 +7353,7 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler, } static int ocfs2_xattr_user_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index a0f45651f3b7..c219f91f44e9 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -279,13 +279,14 @@ out_free_inode: return err; } -static int omfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int omfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { return omfs_add_node(dir, dentry, mode | S_IFDIR); } -static int omfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int omfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { return omfs_add_node(dir, dentry, mode | S_IFREG); } @@ -369,9 +370,9 @@ static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx, return true; } -static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int omfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 2c7b70ee1388..11e733aab25d 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -343,12 +343,13 @@ const struct file_operations omfs_file_operations = { .splice_read = generic_file_splice_read, }; -static int omfs_setattr(struct dentry *dentry, struct iattr *attr) +static int omfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -361,7 +362,7 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr) omfs_truncate(inode); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index ce93ccca8639..2a0e83236c01 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -48,7 +48,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode) goto fail; inode->i_ino = new_block; - inode_init_owner(inode, NULL, mode); + inode_init_owner(&init_user_ns, inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); diff --git a/fs/open.c b/fs/open.c index ca5444733acd..e53af13b5835 100644 --- a/fs/open.c +++ b/fs/open.c @@ -35,8 +35,8 @@ #include "internal.h" -int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, - struct file *filp) +int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, + loff_t length, unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; @@ -61,13 +61,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ - ret = notify_change(dentry, &newattrs, NULL); + ret = notify_change(mnt_userns, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } long vfs_truncate(const struct path *path, loff_t length) { + struct user_namespace *mnt_userns; struct inode *inode; long error; @@ -83,7 +84,8 @@ long vfs_truncate(const struct path *path, loff_t length) if (error) goto out; - error = inode_permission(inode, MAY_WRITE); + mnt_userns = mnt_user_ns(path->mnt); + error = inode_permission(mnt_userns, inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; @@ -107,7 +109,7 @@ long vfs_truncate(const struct path *path, loff_t length) if (!error) error = security_path_truncate(path); if (!error) - error = do_truncate(path->dentry, length, 0, NULL); + error = do_truncate(mnt_userns, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); @@ -186,13 +188,13 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) /* Check IS_APPEND on real upper inode */ if (IS_APPEND(file_inode(f.file))) goto out_putf; - sb_start_write(inode->i_sb); error = locks_verify_truncate(inode, f.file, length); if (!error) error = security_path_truncate(&f.file->f_path); if (!error) - error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file); + error = do_truncate(file_mnt_user_ns(f.file), dentry, length, + ATTR_MTIME | ATTR_CTIME, f.file); sb_end_write(inode->i_sb); out_putf: fdput(f); @@ -436,7 +438,7 @@ retry: goto out_path_release; } - res = inode_permission(inode, mode | MAY_ACCESS); + res = inode_permission(mnt_user_ns(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; @@ -492,7 +494,7 @@ retry: if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; @@ -521,7 +523,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd) if (!d_can_lookup(f.file->f_path.dentry)) goto out_putf; - error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR); + error = file_permission(f.file, MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &f.file->f_path); out_putf: @@ -540,7 +542,7 @@ retry: if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; @@ -580,7 +582,8 @@ retry_deleg: goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(path->dentry, &newattrs, &delegated_inode); + error = notify_change(mnt_user_ns(path->mnt), path->dentry, + &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); if (delegated_inode) { @@ -641,6 +644,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) int chown_common(const struct path *path, uid_t user, gid_t group) { + struct user_namespace *mnt_userns; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; int error; @@ -651,6 +655,10 @@ int chown_common(const struct path *path, uid_t user, gid_t group) uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); + mnt_userns = mnt_user_ns(path->mnt); + uid = kuid_from_mnt(mnt_userns, uid); + gid = kgid_from_mnt(mnt_userns, gid); + retry_deleg: newattrs.ia_valid = ATTR_CTIME; if (user != (uid_t) -1) { @@ -671,7 +679,8 @@ retry_deleg: inode_lock(inode); error = security_path_chown(path, uid, gid); if (!error) - error = notify_change(path->dentry, &newattrs, &delegated_inode); + error = notify_change(mnt_userns, path->dentry, &newattrs, + &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index a25e6c890975..18852b9ed82b 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -116,7 +116,8 @@ out: return error; } -int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int error; struct iattr iattr; @@ -132,7 +133,8 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) * and "mode" to the new desired value. It is up to * us to propagate the new mode back to the server... */ - error = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); + error = posix_acl_update_mode(&init_user_ns, inode, + &iattr.ia_mode, &acl); if (error) { gossip_err("%s: posix_acl_update_mode err: %d\n", __func__, diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 48f0547d4850..5079cfafa8d7 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -855,13 +855,13 @@ again: ORANGEFS_I(inode)->attr_uid = current_fsuid(); ORANGEFS_I(inode)->attr_gid = current_fsgid(); } - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); spin_unlock(&inode->i_lock); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) /* change mod on a file that has ACLs */ - ret = posix_acl_chmod(inode, inode->i_mode); + ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); ret = 0; out: @@ -871,12 +871,13 @@ out: /* * Change attributes of an object referenced by dentry. */ -int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) +int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { int ret; gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n", dentry); - ret = setattr_prepare(dentry, iattr); + ret = setattr_prepare(&init_user_ns, dentry, iattr); if (ret) goto out; ret = __orangefs_setattr(d_inode(dentry), iattr); @@ -890,8 +891,8 @@ out: /* * Obtain attributes of an object given a dentry */ -int orangefs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { int ret; struct inode *inode = path->dentry->d_inode; @@ -903,7 +904,7 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, ret = orangefs_inode_getattr(inode, request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0); if (ret == 0) { - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); /* override block size reported to stat */ if (!(request_mask & STATX_SIZE)) @@ -919,7 +920,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, return ret; } -int orangefs_permission(struct inode *inode, int mask) +int orangefs_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { int ret; @@ -933,7 +935,7 @@ int orangefs_permission(struct inode *inode, int mask) if (ret < 0) return ret; - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags) diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 3e7cf3d0a494..600e8eee541f 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -15,7 +15,8 @@ /* * Get a newly allocated inode to go with a negative dentry. */ -static int orangefs_create(struct inode *dir, +static int orangefs_create(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode, bool exclusive) @@ -215,7 +216,8 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) return ret; } -static int orangefs_symlink(struct inode *dir, +static int orangefs_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, const char *symname) { @@ -303,7 +305,8 @@ out: return ret; } -static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int orangefs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; @@ -372,7 +375,8 @@ out: return ret; } -static int orangefs_rename(struct inode *old_dir, +static int orangefs_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index e12aeb9623d6..0e6b97682e41 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -107,7 +107,9 @@ extern int orangefs_init_acl(struct inode *inode, struct inode *dir); extern const struct xattr_handler *orangefs_xattr_handlers[]; extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type); -extern int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int orangefs_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, struct posix_acl *acl, + int type); /* * orangefs data structures @@ -359,12 +361,13 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct orangefs_object_kref *ref); int __orangefs_setattr(struct inode *, struct iattr *); -int orangefs_setattr(struct dentry *, struct iattr *); +int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *); -int orangefs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); +int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); -int orangefs_permission(struct inode *inode, int mask); +int orangefs_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); int orangefs_update_time(struct inode *, struct timespec64 *, int); diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index bdc285aea360..9a5b757fbd2f 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -526,6 +526,7 @@ out_unlock: } static int orangefs_xattr_set_default(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 0fed532efa68..0b2891c6c71e 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -93,9 +93,9 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old, continue; /* Discard */ } retry: - size = vfs_getxattr(old, name, value, value_size); + size = vfs_getxattr(&init_user_ns, old, name, value, value_size); if (size == -ERANGE) - size = vfs_getxattr(old, name, NULL, 0); + size = vfs_getxattr(&init_user_ns, old, name, NULL, 0); if (size < 0) { error = size; @@ -115,7 +115,7 @@ retry: goto retry; } - error = vfs_setxattr(new, name, value, size, 0); + error = vfs_setxattr(&init_user_ns, new, name, value, size, 0); if (error) { if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name)) break; @@ -236,7 +236,7 @@ static int ovl_set_size(struct dentry *upperdentry, struct kstat *stat) .ia_size = stat->size, }; - return notify_change(upperdentry, &attr, NULL); + return notify_change(&init_user_ns, upperdentry, &attr, NULL); } static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) @@ -248,7 +248,7 @@ static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) .ia_mtime = stat->mtime, }; - return notify_change(upperdentry, &attr, NULL); + return notify_change(&init_user_ns, upperdentry, &attr, NULL); } int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) @@ -260,7 +260,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) .ia_valid = ATTR_MODE, .ia_mode = stat->mode, }; - err = notify_change(upperdentry, &attr, NULL); + err = notify_change(&init_user_ns, upperdentry, &attr, NULL); } if (!err) { struct iattr attr = { @@ -268,7 +268,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) .ia_uid = stat->uid, .ia_gid = stat->gid, }; - err = notify_change(upperdentry, &attr, NULL); + err = notify_change(&init_user_ns, upperdentry, &attr, NULL); } if (!err) ovl_set_timestamps(upperdentry, stat); @@ -796,7 +796,7 @@ static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value) ssize_t res; char *buf; - res = vfs_getxattr(dentry, name, NULL, 0); + res = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0); if (res == -ENODATA || res == -EOPNOTSUPP) res = 0; @@ -805,7 +805,7 @@ static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value) if (!buf) return -ENOMEM; - res = vfs_getxattr(dentry, name, buf, res); + res = vfs_getxattr(&init_user_ns, dentry, name, buf, res); if (res < 0) kfree(buf); else @@ -847,8 +847,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) * don't want that to happen for normal copy-up operation. */ if (capability) { - err = vfs_setxattr(upperpath.dentry, XATTR_NAME_CAPS, - capability, cap_size, 0); + err = vfs_setxattr(&init_user_ns, upperpath.dentry, + XATTR_NAME_CAPS, capability, cap_size, 0); if (err) goto out_free; } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index d1efa3a5a503..836f14b9d3a6 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -449,7 +449,7 @@ static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name, if (err < 0) goto out_free; - err = vfs_setxattr(upperdentry, name, buffer, size, XATTR_CREATE); + err = vfs_setxattr(&init_user_ns, upperdentry, name, buffer, size, XATTR_CREATE); out_free: kfree(buffer); return err; @@ -508,7 +508,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, .ia_mode = cattr->mode, }; inode_lock(newdentry->d_inode); - err = notify_change(newdentry, &attr, NULL); + err = notify_change(&init_user_ns, newdentry, &attr, NULL); inode_unlock(newdentry->d_inode); if (err) goto out_cleanup; @@ -636,7 +636,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, inode->i_state |= I_CREATING; spin_unlock(&inode->i_lock); - inode_init_owner(inode, dentry->d_parent->d_inode, mode); + inode_init_owner(&init_user_ns, inode, dentry->d_parent->d_inode, mode); attr.mode = inode->i_mode; err = ovl_create_or_link(dentry, inode, &attr, false); @@ -650,19 +650,20 @@ out: return err; } -static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int ovl_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); } -static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ovl_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); } -static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) +static int ovl_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { /* Don't allow creation of "whiteout" on overlay */ if (S_ISCHR(mode) && rdev == WHITEOUT_DEV) @@ -671,8 +672,8 @@ static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, return ovl_create_object(dentry, mode, rdev, NULL); } -static int ovl_symlink(struct inode *dir, struct dentry *dentry, - const char *link) +static int ovl_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *link) { return ovl_create_object(dentry, S_IFLNK, 0, link); } @@ -821,9 +822,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir, goto out_dput_upper; if (is_dir) - err = vfs_rmdir(dir, upper); + err = vfs_rmdir(&init_user_ns, dir, upper); else - err = vfs_unlink(dir, upper, NULL); + err = vfs_unlink(&init_user_ns, dir, upper, NULL); ovl_dir_modified(dentry->d_parent, ovl_type_origin(dentry)); /* @@ -1069,9 +1070,9 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) return err; } -static int ovl_rename(struct inode *olddir, struct dentry *old, - struct inode *newdir, struct dentry *new, - unsigned int flags) +static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, + struct dentry *old, struct inode *newdir, + struct dentry *new, unsigned int flags) { int err; struct dentry *old_upperdir; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 077d3ad343f6..dbfb35fb0ff7 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -50,11 +50,11 @@ static struct file *ovl_open_realfile(const struct file *file, acc_mode |= MAY_APPEND; old_cred = ovl_override_creds(inode->i_sb); - err = inode_permission(realinode, MAY_OPEN | acc_mode); + err = inode_permission(&init_user_ns, realinode, MAY_OPEN | acc_mode); if (err) { realfile = ERR_PTR(err); } else { - if (!inode_owner_or_capable(realinode)) + if (!inode_owner_or_capable(&init_user_ns, realinode)) flags &= ~O_NOATIME; realfile = open_with_fake_path(&file->f_path, flags, realinode, @@ -521,7 +521,7 @@ static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd, long ret; struct inode *inode = file_inode(file); - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; ret = mnt_want_write_file(file); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index cf41bcb664bc..003cf83bf78a 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -14,14 +14,15 @@ #include "overlayfs.h" -int ovl_setattr(struct dentry *dentry, struct iattr *attr) +int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { int err; bool full_copy_up = false; struct dentry *upperdentry; const struct cred *old_cred; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) return err; @@ -79,7 +80,7 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) inode_lock(upperdentry->d_inode); old_cred = ovl_override_creds(dentry->d_sb); - err = notify_change(upperdentry, attr, NULL); + err = notify_change(&init_user_ns, upperdentry, attr, NULL); revert_creds(old_cred); if (!err) ovl_copyattr(upperdentry->d_inode, dentry->d_inode); @@ -154,8 +155,8 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) return 0; } -int ovl_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; enum ovl_path_type type; @@ -277,7 +278,8 @@ out: return err; } -int ovl_permission(struct inode *inode, int mask) +int ovl_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct inode *upperinode = ovl_inode_upper(inode); struct inode *realinode = upperinode ?: ovl_inode_lower(inode); @@ -294,7 +296,7 @@ int ovl_permission(struct inode *inode, int mask) * Check overlay inode with the creds of task and underlying inode * with creds of mounter */ - err = generic_permission(inode, mask); + err = generic_permission(&init_user_ns, inode, mask); if (err) return err; @@ -305,7 +307,7 @@ int ovl_permission(struct inode *inode, int mask) /* Make sure mounter can read file for copy up later */ mask |= MAY_READ; } - err = inode_permission(realinode, mask); + err = inode_permission(&init_user_ns, realinode, mask); revert_creds(old_cred); return err; @@ -353,7 +355,7 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, if (!value && !upperdentry) { old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getxattr(realdentry, name, NULL, 0); + err = vfs_getxattr(&init_user_ns, realdentry, name, NULL, 0); revert_creds(old_cred); if (err < 0) goto out_drop_write; @@ -369,10 +371,11 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, old_cred = ovl_override_creds(dentry->d_sb); if (value) - err = vfs_setxattr(realdentry, name, value, size, flags); + err = vfs_setxattr(&init_user_ns, realdentry, name, value, size, + flags); else { WARN_ON(flags != XATTR_REPLACE); - err = vfs_removexattr(realdentry, name); + err = vfs_removexattr(&init_user_ns, realdentry, name); } revert_creds(old_cred); @@ -394,7 +397,7 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry); old_cred = ovl_override_creds(dentry->d_sb); - res = vfs_getxattr(realdentry, name, value, size); + res = vfs_getxattr(&init_user_ns, realdentry, name, value, size); revert_creds(old_cred); return res; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index cb4e2d60ecf9..95cff83786a5 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -123,7 +123,7 @@ static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox) static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) { - int err = vfs_rmdir(dir, dentry); + int err = vfs_rmdir(&init_user_ns, dir, dentry); pr_debug("rmdir(%pd2) = %i\n", dentry, err); return err; @@ -131,7 +131,7 @@ static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) { - int err = vfs_unlink(dir, dentry, NULL); + int err = vfs_unlink(&init_user_ns, dir, dentry, NULL); pr_debug("unlink(%pd2) = %i\n", dentry, err); return err; @@ -140,7 +140,7 @@ static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { - int err = vfs_link(old_dentry, dir, new_dentry, NULL); + int err = vfs_link(old_dentry, &init_user_ns, dir, new_dentry, NULL); pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err); return err; @@ -149,7 +149,7 @@ static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_create(dir, dentry, mode, true); + int err = vfs_create(&init_user_ns, dir, dentry, mode, true); pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; @@ -158,7 +158,7 @@ static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_mkdir(dir, dentry, mode); + int err = vfs_mkdir(&init_user_ns, dir, dentry, mode); pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } @@ -166,7 +166,7 @@ static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - int err = vfs_mknod(dir, dentry, mode, dev); + int err = vfs_mknod(&init_user_ns, dir, dentry, mode, dev); pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); return err; @@ -175,7 +175,7 @@ static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) { - int err = vfs_symlink(dir, dentry, oldname); + int err = vfs_symlink(&init_user_ns, dir, dentry, oldname); pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); return err; @@ -186,7 +186,7 @@ static inline ssize_t ovl_do_getxattr(struct ovl_fs *ofs, struct dentry *dentry, size_t size) { const char *name = ovl_xattr(ofs, ox); - return vfs_getxattr(dentry, name, value, size); + return vfs_getxattr(&init_user_ns, dentry, name, value, size); } static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, @@ -194,7 +194,7 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, size_t size) { const char *name = ovl_xattr(ofs, ox); - int err = vfs_setxattr(dentry, name, value, size, 0); + int err = vfs_setxattr(&init_user_ns, dentry, name, value, size, 0); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n", dentry, name, min((int)size, 48), value, size, err); return err; @@ -204,7 +204,7 @@ static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox) { const char *name = ovl_xattr(ofs, ox); - int err = vfs_removexattr(dentry, name); + int err = vfs_removexattr(&init_user_ns, dentry, name); pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); return err; } @@ -214,9 +214,18 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, unsigned int flags) { int err; + struct renamedata rd = { + .old_mnt_userns = &init_user_ns, + .old_dir = olddir, + .old_dentry = olddentry, + .new_mnt_userns = &init_user_ns, + .new_dir = newdir, + .new_dentry = newdentry, + .flags = flags, + }; pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); - err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags); + err = vfs_rename(&rd); if (err) { pr_debug("...rename(%pd2, %pd2, ...) = %i\n", olddentry, newdentry, err); @@ -226,14 +235,14 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) { - int err = vfs_whiteout(dir, dentry); + int err = vfs_whiteout(&init_user_ns, dir, dentry); pr_debug("whiteout(%pd2) = %i\n", dentry, err); return err; } static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode) { - struct dentry *ret = vfs_tmpfile(dentry, mode, 0); + struct dentry *ret = vfs_tmpfile(&init_user_ns, dentry, mode, 0); int err = PTR_ERR_OR_ZERO(ret); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); @@ -436,10 +445,12 @@ int ovl_set_nlink_lower(struct dentry *dentry); unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, struct dentry *upperdentry, unsigned int fallback); -int ovl_setattr(struct dentry *dentry, struct iattr *attr); -int ovl_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); -int ovl_permission(struct inode *inode, int mask); +int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); +int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +int ovl_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask); int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags); int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index d58b8f2bf9d0..fdd72f1a9c5e 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -803,17 +803,19 @@ retry: * allowed as upper are limited to "normal" ones, where checking * for the above two errors is sufficient. */ - err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); + err = vfs_removexattr(&init_user_ns, work, + XATTR_NAME_POSIX_ACL_DEFAULT); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; - err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS); + err = vfs_removexattr(&init_user_ns, work, + XATTR_NAME_POSIX_ACL_ACCESS); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; /* Clear any inherited mode bits */ inode_lock(work->d_inode); - err = notify_change(work, &attr, NULL); + err = notify_change(&init_user_ns, work, &attr, NULL); inode_unlock(work->d_inode); if (err) goto out_dput; @@ -865,6 +867,10 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) pr_err("filesystem on '%s' not supported\n", name); goto out_put; } + if (mnt_user_ns(path->mnt) != &init_user_ns) { + pr_err("idmapped layers are currently not supported\n"); + goto out_put; + } if (!d_is_dir(path->dentry)) { pr_err("'%s' not a directory\n", name); goto out_put; @@ -989,6 +995,7 @@ ovl_posix_acl_xattr_get(const struct xattr_handler *handler, static int __maybe_unused ovl_posix_acl_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -1014,7 +1021,7 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler, goto out_acl_release; } err = -EPERM; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) goto out_acl_release; posix_acl_release(acl); @@ -1026,10 +1033,10 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler, if (unlikely(inode->i_mode & S_ISGID) && handler->flags == ACL_TYPE_ACCESS && !in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(inode, CAP_FSETID)) { + !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) { struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; - err = ovl_setattr(dentry, &iattr); + err = ovl_setattr(&init_user_ns, dentry, &iattr); if (err) return err; } @@ -1053,6 +1060,7 @@ static int ovl_own_xattr_get(const struct xattr_handler *handler, } static int ovl_own_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -1068,6 +1076,7 @@ static int ovl_other_xattr_get(const struct xattr_handler *handler, } static int ovl_other_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 9826b003f1d2..7f5a01a11f97 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -479,12 +479,12 @@ struct file *ovl_path_open(struct path *path, int flags) BUG(); } - err = inode_permission(inode, acc_mode | MAY_OPEN); + err = inode_permission(&init_user_ns, inode, acc_mode | MAY_OPEN); if (err) return ERR_PTR(err); /* O_NOATIME is an optimization, don't fail if not permitted */ - if (inode_owner_or_capable(inode)) + if (inode_owner_or_capable(&init_user_ns, inode)) flags |= O_NOATIME; return dentry_open(path, flags, current_cred()); diff --git a/fs/pipe.c b/fs/pipe.c index 39c96845a72f..bfd946a9ad01 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -171,7 +171,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal); * * Description: * This function grabs an extra reference to @buf. It's used in - * in the tee() system call, when we duplicate the buffers in one + * the tee() system call, when we duplicate the buffers in one * pipe into another. */ bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 95882b3f5f62..f3309a7edb49 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -345,10 +345,13 @@ EXPORT_SYMBOL(posix_acl_from_mode); * by the acl. Returns -E... otherwise. */ int -posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) +posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, + const struct posix_acl *acl, int want) { const struct posix_acl_entry *pa, *pe, *mask_obj; int found = 0; + kuid_t uid; + kgid_t gid; want &= MAY_READ | MAY_WRITE | MAY_EXEC; @@ -356,22 +359,26 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ - if (uid_eq(inode->i_uid, current_fsuid())) + uid = i_uid_into_mnt(mnt_userns, inode); + if (uid_eq(uid, current_fsuid())) goto check_perm; break; case ACL_USER: - if (uid_eq(pa->e_uid, current_fsuid())) + uid = kuid_into_mnt(mnt_userns, pa->e_uid); + if (uid_eq(uid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: - if (in_group_p(inode->i_gid)) { + gid = i_gid_into_mnt(mnt_userns, inode); + if (in_group_p(gid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; } break; case ACL_GROUP: - if (in_group_p(pa->e_gid)) { + gid = kgid_into_mnt(mnt_userns, pa->e_gid); + if (in_group_p(gid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; @@ -551,8 +558,22 @@ __posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) } EXPORT_SYMBOL(__posix_acl_chmod); +/** + * posix_acl_chmod - chmod a posix acl + * + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check permissions on + * @mode: the new mode of @inode + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before checking + * permissions. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. + */ int -posix_acl_chmod(struct inode *inode, umode_t mode) + posix_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode, + umode_t mode) { struct posix_acl *acl; int ret = 0; @@ -572,7 +593,7 @@ posix_acl_chmod(struct inode *inode, umode_t mode) ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) return ret; - ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS); + ret = inode->i_op->set_acl(mnt_userns, inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); return ret; } @@ -631,9 +652,10 @@ EXPORT_SYMBOL_GPL(posix_acl_create); /** * posix_acl_update_mode - update mode in set_acl - * @inode: target inode - * @mode_p: mode (pointer) for update - * @acl: acl pointer + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: target inode + * @mode_p: mode (pointer) for update + * @acl: acl pointer * * Update the file mode when setting an ACL: compute the new file permission * bits based on the ACL. In addition, if the ACL is equivalent to the new @@ -642,9 +664,16 @@ EXPORT_SYMBOL_GPL(posix_acl_create); * As with chmod, clear the setgid bit if the caller is not in the owning group * or capable of CAP_FSETID (see inode_change_ok). * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before checking + * permissions. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. + * * Called from set_acl inode operations. */ -int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, +int posix_acl_update_mode(struct user_namespace *mnt_userns, + struct inode *inode, umode_t *mode_p, struct posix_acl **acl) { umode_t mode = inode->i_mode; @@ -655,8 +684,8 @@ int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, return error; if (error == 0) *acl = NULL; - if (!in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; return 0; @@ -668,7 +697,8 @@ EXPORT_SYMBOL(posix_acl_update_mode); */ static void posix_acl_fix_xattr_userns( struct user_namespace *to, struct user_namespace *from, - void *value, size_t size) + struct user_namespace *mnt_userns, + void *value, size_t size, bool from_user) { struct posix_acl_xattr_header *header = value; struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; @@ -693,10 +723,18 @@ static void posix_acl_fix_xattr_userns( switch(le16_to_cpu(entry->e_tag)) { case ACL_USER: uid = make_kuid(from, le32_to_cpu(entry->e_id)); + if (from_user) + uid = kuid_from_mnt(mnt_userns, uid); + else + uid = kuid_into_mnt(mnt_userns, uid); entry->e_id = cpu_to_le32(from_kuid(to, uid)); break; case ACL_GROUP: gid = make_kgid(from, le32_to_cpu(entry->e_id)); + if (from_user) + gid = kgid_from_mnt(mnt_userns, gid); + else + gid = kgid_into_mnt(mnt_userns, gid); entry->e_id = cpu_to_le32(from_kgid(to, gid)); break; default: @@ -705,20 +743,24 @@ static void posix_acl_fix_xattr_userns( } } -void posix_acl_fix_xattr_from_user(void *value, size_t size) +void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, + void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) + if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value, + size, true); } -void posix_acl_fix_xattr_to_user(void *value, size_t size) +void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, + void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) + if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value, + size, false); } /* @@ -858,7 +900,8 @@ posix_acl_xattr_get(const struct xattr_handler *handler, } int -set_posix_acl(struct inode *inode, int type, struct posix_acl *acl) +set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode, + int type, struct posix_acl *acl) { if (!IS_POSIXACL(inode)) return -EOPNOTSUPP; @@ -867,7 +910,7 @@ set_posix_acl(struct inode *inode, int type, struct posix_acl *acl) if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; if (acl) { @@ -875,15 +918,16 @@ set_posix_acl(struct inode *inode, int type, struct posix_acl *acl) if (ret) return ret; } - return inode->i_op->set_acl(inode, acl, type); + return inode->i_op->set_acl(mnt_userns, inode, acl, type); } EXPORT_SYMBOL(set_posix_acl); static int posix_acl_xattr_set(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) + struct user_namespace *mnt_userns, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) { struct posix_acl *acl = NULL; int ret; @@ -893,7 +937,7 @@ posix_acl_xattr_set(const struct xattr_handler *handler, if (IS_ERR(acl)) return PTR_ERR(acl); } - ret = set_posix_acl(inode, handler->flags, acl); + ret = set_posix_acl(mnt_userns, inode, handler->flags, acl); posix_acl_release(acl); return ret; } @@ -922,12 +966,13 @@ const struct xattr_handler posix_acl_default_xattr_handler = { }; EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); -int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int error; if (type == ACL_TYPE_ACCESS) { - error = posix_acl_update_mode(inode, + error = posix_acl_update_mode(mnt_userns, inode, &inode->i_mode, &acl); if (error) return error; diff --git a/fs/proc/base.c b/fs/proc/base.c index b3422cda2a91..3851bfcdba56 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -67,7 +67,6 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/rcupdate.h> -#include <linux/kallsyms.h> #include <linux/stacktrace.h> #include <linux/resource.h> #include <linux/module.h> @@ -386,19 +385,17 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long wchan; - char symname[KSYM_NAME_LEN]; - if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) - goto print0; + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + wchan = get_wchan(task); + else + wchan = 0; - wchan = get_wchan(task); - if (wchan && !lookup_symbol_name(wchan, symname)) { - seq_puts(m, symname); - return 0; - } + if (wchan) + seq_printf(m, "%ps", (void *) wchan); + else + seq_putc(m, '0'); -print0: - seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ @@ -685,7 +682,8 @@ static int proc_fd_access_allowed(struct inode *inode) return allowed; } -int proc_setattr(struct dentry *dentry, struct iattr *attr) +int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { int error; struct inode *inode = d_inode(dentry); @@ -693,11 +691,11 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) return -EPERM; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } @@ -726,7 +724,8 @@ static bool has_pid_permissions(struct proc_fs_info *fs_info, } -static int proc_pid_permission(struct inode *inode, int mask) +static int proc_pid_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; @@ -751,7 +750,7 @@ static int proc_pid_permission(struct inode *inode, int mask) return -EPERM; } - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } @@ -1927,14 +1926,14 @@ out_unlock: return NULL; } -int pid_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int pid_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; @@ -3473,7 +3472,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) * This function makes sure that the node is always accessible for members of * same thread group. */ -static int proc_tid_comm_permission(struct inode *inode, int mask) +static int proc_tid_comm_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { bool is_same_tgroup; struct task_struct *task; @@ -3492,7 +3492,7 @@ static int proc_tid_comm_permission(struct inode *inode, int mask) return 0; } - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } static const struct inode_operations proc_tid_comm_inode_operations = { @@ -3798,12 +3798,13 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) return 0; } -static int proc_task_getattr(const struct path *path, struct kstat *stat, +static int proc_task_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); if (p) { stat->nlink += get_nr_threads(p); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index cb51763ed554..07fc4fad2602 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -276,12 +276,13 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). */ -int proc_fd_permission(struct inode *inode, int mask) +int proc_fd_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct task_struct *p; int rv; - rv = generic_permission(inode, mask); + rv = generic_permission(&init_user_ns, inode, mask); if (rv == 0) return rv; diff --git a/fs/proc/fd.h b/fs/proc/fd.h index f371a602bf58..c5a921a06a0b 100644 --- a/fs/proc/fd.h +++ b/fs/proc/fd.h @@ -10,7 +10,8 @@ extern const struct inode_operations proc_fd_inode_operations; extern const struct file_operations proc_fdinfo_operations; extern const struct inode_operations proc_fdinfo_inode_operations; -extern int proc_fd_permission(struct inode *inode, int mask); +extern int proc_fd_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); static inline unsigned int proc_fd(struct inode *inode) { diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 6c0a05f55d6b..bc86aa87cc41 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -115,17 +115,18 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, return true; } -static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) +static int proc_notify_change(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); int error; - error = setattr_prepare(dentry, iattr); + error = setattr_prepare(&init_user_ns, dentry, iattr); if (error) return error; - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); mark_inode_dirty(inode); proc_set_user(de, inode->i_uid, inode->i_gid); @@ -133,7 +134,8 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) return 0; } -static int proc_getattr(const struct path *path, struct kstat *stat, +static int proc_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -145,7 +147,7 @@ static int proc_getattr(const struct path *path, struct kstat *stat, } } - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); return 0; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index f60b379dcdc7..03415f3fb3a8 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -162,8 +162,10 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, * base.c */ extern const struct dentry_operations pid_dentry_operations; -extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); -extern int proc_setattr(struct dentry *, struct iattr *); +extern int pid_getattr(struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); +extern int proc_setattr(struct user_namespace *, struct dentry *, + struct iattr *); extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern void pid_update_inode(struct task_struct *, struct inode *); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index d6fc74619625..6fa761c9cc78 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -129,15 +129,15 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_TRANSPARENT_HUGEPAGE show_val_kb(m, "AnonHugePages: ", - global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR); + global_node_page_state(NR_ANON_THPS)); show_val_kb(m, "ShmemHugePages: ", - global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); + global_node_page_state(NR_SHMEM_THPS)); show_val_kb(m, "ShmemPmdMapped: ", - global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); + global_node_page_state(NR_SHMEM_PMDMAPPED)); show_val_kb(m, "FileHugePages: ", - global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR); + global_node_page_state(NR_FILE_THPS)); show_val_kb(m, "FilePmdMapped: ", - global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR); + global_node_page_state(NR_FILE_PMDMAPPED)); #endif #ifdef CONFIG_CMA diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 18601042af99..15c2e55d2ed2 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -289,7 +289,8 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, return de; } -static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat, +static int proc_tgid_net_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -297,7 +298,7 @@ static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat, net = get_proc_task_net(inode); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); if (net != NULL) { stat->nlink = net->proc_net->nlink; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d2018f70d1fa..984e42f8cb11 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -571,7 +571,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, error = -ENOMEM; if (count >= KMALLOC_MAX_SIZE) goto out; - kbuf = kzalloc(count + 1, GFP_KERNEL); + kbuf = kvzalloc(count + 1, GFP_KERNEL); if (!kbuf) goto out; @@ -600,7 +600,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, error = count; out_free_buf: - kfree(kbuf); + kvfree(kbuf); out: sysctl_head_finish(head); @@ -785,7 +785,8 @@ out: return 0; } -static int proc_sys_permission(struct inode *inode, int mask) +static int proc_sys_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { /* * sysctl entries that are not writeable, @@ -813,7 +814,8 @@ static int proc_sys_permission(struct inode *inode, int mask) return error; } -static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) +static int proc_sys_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; @@ -821,16 +823,17 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) return -EPERM; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } -static int proc_sys_getattr(const struct path *path, struct kstat *stat, +static int proc_sys_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -840,7 +843,7 @@ static int proc_sys_getattr(const struct path *path, struct kstat *stat, if (IS_ERR(head)) return PTR_ERR(head); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; diff --git a/fs/proc/root.c b/fs/proc/root.c index 5e444d4f9717..c7e3b1350ef8 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -308,10 +308,11 @@ void __init proc_root_init(void) register_filesystem(&proc_fs_type); } -static int proc_root_getattr(const struct path *path, struct kstat *stat, +static int proc_root_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - generic_fillattr(d_inode(path->dentry), stat); + generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } diff --git a/fs/proc/self.c b/fs/proc/self.c index a4012154e109..72cd69bcaf4a 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -16,13 +16,6 @@ static const char *proc_self_get_link(struct dentry *dentry, pid_t tgid = task_tgid_nr_ns(current, ns); char *name; - /* - * Not currently supported. Once we can inherit all of struct pid, - * we can allow this. - */ - if (current->flags & PF_IO_WORKER) - return ERR_PTR(-EOPNOTSUPP); - if (!tgid) return ERR_PTR(-ENOENT); /* max length of unsigned int in decimal + NULL term */ diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index d56681d86d28..a553273fbd41 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -17,13 +17,6 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, pid_t pid = task_pid_nr_ns(current, ns); char *name; - /* - * Not currently supported. Once we can inherit all of struct pid, - * we can allow this. - */ - if (current->flags & PF_IO_WORKER) - return ERR_PTR(-EOPNOTSUPP); - if (!pid) return ERR_PTR(-ENOENT); name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index c3a345c28a93..9a15334da208 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1503,11 +1503,8 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) return 0; out_err: - if (buf) - vfree(buf); - - if (dump) - vfree(dump); + vfree(buf); + vfree(dump); return ret; } diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index eafb75755fa3..392ef5162655 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -79,6 +79,9 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) if (mnt->mnt_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } + + if (mnt_user_ns(mnt) != &init_user_ns) + seq_puts(m, ",idmapped"); } static inline void mangle(struct seq_file *m, const char *s) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 93a217e4f563..14658b009f1b 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -467,7 +467,7 @@ static struct dentry *pstore_mount(struct file_system_type *fs_type, static void pstore_kill_sb(struct super_block *sb) { mutex_lock(&pstore_sb_lock); - WARN_ON(pstore_sb != sb); + WARN_ON(pstore_sb && pstore_sb != sb); kill_litter_super(sb); pstore_sb = NULL; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index aa8e0b65ff1a..fff363bfd484 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -246,7 +246,7 @@ static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, pr_info("error in header, %d\n", numerr); prz->corrected_bytes += numerr; } else if (numerr < 0) { - pr_info("uncorrectable error in header\n"); + pr_info_ratelimited("uncorrectable error in header\n"); prz->bad_blocks++; } diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 355523f4a4bf..ba3525ccc27e 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -22,7 +22,7 @@ #include <linux/uaccess.h> #include "internal.h" -static int ramfs_nommu_setattr(struct dentry *, struct iattr *); +static int ramfs_nommu_setattr(struct user_namespace *, struct dentry *, struct iattr *); static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, @@ -158,14 +158,15 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) * handle a change of attributes * - we're specifically interested in a change of size */ -static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) +static int ramfs_nommu_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *ia) { struct inode *inode = d_inode(dentry); unsigned int old_ia_valid = ia->ia_valid; int ret = 0; /* POSIX UID/GID verification for setting inode attributes */ - ret = setattr_prepare(dentry, ia); + ret = setattr_prepare(&init_user_ns, dentry, ia); if (ret) return ret; @@ -185,7 +186,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) } } - setattr_copy(inode, ia); + setattr_copy(&init_user_ns, inode, ia); out: ia->ia_valid = old_ia_valid; return ret; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index ee179a81b3da..9ebd17d7befb 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -67,7 +67,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, if (inode) { inode->i_ino = get_next_ino(); - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_mapping->a_ops = &ramfs_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); @@ -101,7 +101,8 @@ struct inode *ramfs_get_inode(struct super_block *sb, */ /* SMP-safe */ static int -ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +ramfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); int error = -ENOSPC; @@ -115,20 +116,23 @@ ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) return error; } -static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) +static int ramfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { - int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0); + int retval = ramfs_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return retval; } -static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) +static int ramfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - return ramfs_mknod(dir, dentry, mode | S_IFREG, 0); + return ramfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); } -static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname) +static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct inode *inode; int error = -ENOSPC; @@ -147,6 +151,18 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * return error; } +static int ramfs_tmpfile(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + inode = ramfs_get_inode(dir->i_sb, dir, mode, 0); + if (!inode) + return -ENOSPC; + d_tmpfile(dentry, inode); + return 0; +} + static const struct inode_operations ramfs_dir_inode_operations = { .create = ramfs_create, .lookup = simple_lookup, @@ -157,6 +173,7 @@ static const struct inode_operations ramfs_dir_inode_operations = { .rmdir = simple_rmdir, .mknod = ramfs_mknod, .rename = simple_rename, + .tmpfile = ramfs_tmpfile, }; /* diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index 0c1c847f992f..fd58618da360 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -49,7 +49,8 @@ static inline int reiserfs_acl_count(size_t size) #ifdef CONFIG_REISERFS_FS_POSIX_ACL struct posix_acl *reiserfs_get_acl(struct inode *inode, int type); -int reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); int reiserfs_acl_chmod(struct inode *inode); int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index c76d563dec0e..780bb90c1804 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3282,13 +3282,14 @@ static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return ret; } -int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) +int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid; int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -3413,7 +3414,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) } if (!error) { - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); } diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index adb21bea3d60..4f1cbd930179 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -59,7 +59,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) break; - if (!inode_owner_or_capable(inode)) { + if (!inode_owner_or_capable(&init_user_ns, inode)) { err = -EPERM; goto setflags_out; } @@ -101,7 +101,7 @@ setflags_out: err = put_user(inode->i_generation, (int __user *)arg); break; case REISERFS_IOC_SETVERSION: - if (!inode_owner_or_capable(inode)) { + if (!inode_owner_or_capable(&init_user_ns, inode)) { err = -EPERM; break; } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 1594687582f0..e6eb05e2b2f1 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -615,12 +615,12 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode) * the quota init calls have to know who to charge the quota to, so * we have to set uid and gid here */ - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); return dquot_initialize(inode); } -static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int reiserfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { int retval; struct inode *inode; @@ -698,8 +698,8 @@ out_failed: return retval; } -static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) +static int reiserfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { int retval; struct inode *inode; @@ -781,7 +781,8 @@ out_failed: return retval; } -static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int reiserfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { int retval; struct inode *inode; @@ -1094,8 +1095,9 @@ out_unlink: return retval; } -static int reiserfs_symlink(struct inode *parent_dir, - struct dentry *dentry, const char *symname) +static int reiserfs_symlink(struct user_namespace *mnt_userns, + struct inode *parent_dir, struct dentry *dentry, + const char *symname) { int retval; struct inode *inode; @@ -1304,7 +1306,8 @@ static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de, * one path. If it holds 2 or more, it can get into endless waiting in * get_empty_nodes or its clones */ -static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, +static int reiserfs_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index f69871516167..0ca2ac62e534 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -3102,7 +3102,8 @@ static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th, } void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); -int reiserfs_setattr(struct dentry *dentry, struct iattr *attr); +int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index fe63a7c3e0da..bd073836e141 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -66,14 +66,14 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { BUG_ON(!inode_is_locked(dir)); - return dir->i_op->create(dir, dentry, mode, true); + return dir->i_op->create(&init_user_ns, dir, dentry, mode, true); } #endif static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { BUG_ON(!inode_is_locked(dir)); - return dir->i_op->mkdir(dir, dentry, mode); + return dir->i_op->mkdir(&init_user_ns, dir, dentry, mode); } /* @@ -352,7 +352,7 @@ static int chown_one_xattr(struct dentry *dentry, void *data) * ATTR_MODE is set. */ attrs->ia_valid &= (ATTR_UID|ATTR_GID); - err = reiserfs_setattr(dentry, attrs); + err = reiserfs_setattr(&init_user_ns, dentry, attrs); attrs->ia_valid = ia_valid; return err; @@ -604,7 +604,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR); inode_dio_wait(d_inode(dentry)); - err = reiserfs_setattr(dentry, &newattrs); + err = reiserfs_setattr(&init_user_ns, dentry, &newattrs); inode_unlock(d_inode(dentry)); } else update_ctime(inode); @@ -948,7 +948,8 @@ static int xattr_mount_check(struct super_block *s) return 0; } -int reiserfs_permission(struct inode *inode, int mask) +int reiserfs_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { /* * We don't do permission checks on the internal objects. @@ -957,7 +958,7 @@ int reiserfs_permission(struct inode *inode, int mask) if (IS_PRIVATE(inode)) return 0; - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags) diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index c764352447ba..9b3b06da568c 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -16,7 +16,8 @@ int reiserfs_xattr_init(struct super_block *sb, int mount_flags); int reiserfs_lookup_privroot(struct super_block *sb); int reiserfs_delete_xattrs(struct inode *inode); int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); -int reiserfs_permission(struct inode *inode, int mask); +int reiserfs_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); #ifdef CONFIG_REISERFS_FS_XATTR #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index ccd40df6eb45..a9547144a099 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -18,7 +18,8 @@ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, int -reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int error, error2; struct reiserfs_transaction_handle th; @@ -40,7 +41,8 @@ reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) reiserfs_write_unlock(inode->i_sb); if (error == 0) { if (type == ACL_TYPE_ACCESS && acl) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = posix_acl_update_mode(&init_user_ns, inode, + &mode, &acl); if (error) goto unlock; update_mode = 1; @@ -399,5 +401,5 @@ int reiserfs_acl_chmod(struct inode *inode) !reiserfs_posixacl(inode->i_sb)) return 0; - return posix_acl_chmod(inode, inode->i_mode); + return posix_acl_chmod(&init_user_ns, inode, inode->i_mode); } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 20be9a0e5870..8965c8e5e172 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -21,7 +21,8 @@ security_get(const struct xattr_handler *handler, struct dentry *unused, } static int -security_set(const struct xattr_handler *handler, struct dentry *unused, +security_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 5ed48da3d02b..d853cea2afcd 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -20,7 +20,8 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused, } static int -trusted_set(const struct xattr_handler *handler, struct dentry *unused, +trusted_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index a573ca45bacc..65d9cd10a5ea 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -18,7 +18,8 @@ user_get(const struct xattr_handler *handler, struct dentry *unused, } static int -user_set(const struct xattr_handler *handler, struct dentry *unused, +user_set(const struct xattr_handler *handler, struct user_namespace *mnt_userns, + struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { diff --git a/fs/remap_range.c b/fs/remap_range.c index 77dba3a49e65..e4a5fdd7ad7b 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -432,13 +432,16 @@ EXPORT_SYMBOL(vfs_clone_file_range); /* Check whether we are allowed to dedupe the destination file */ static bool allow_file_dedupe(struct file *file) { + struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct inode *inode = file_inode(file); + if (capable(CAP_SYS_ADMIN)) return true; if (file->f_mode & FMODE_WRITE) return true; - if (uid_eq(current_fsuid(), file_inode(file)->i_uid)) + if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) return true; - if (!inode_permission(file_inode(file), MAY_WRITE)) + if (!inode_permission(mnt_userns, inode, MAY_WRITE)) return true; return false; } diff --git a/fs/stat.c b/fs/stat.c index dacecdda2e79..fbc171d038aa 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -26,21 +26,29 @@ /** * generic_fillattr - Fill in the basic attributes from the inode struct - * @inode: Inode to use as the source - * @stat: Where to fill in the attributes + * @mnt_userns: user namespace of the mount the inode was found from + * @inode: Inode to use as the source + * @stat: Where to fill in the attributes * * Fill in the basic attributes in the kstat structure from data that's to be * found on the VFS inode structure. This is the default if no getattr inode * operation is supplied. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before filling in the + * uid and gid filds. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. */ -void generic_fillattr(struct inode *inode, struct kstat *stat) +void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode, + struct kstat *stat) { stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; - stat->uid = inode->i_uid; - stat->gid = inode->i_gid; + stat->uid = i_uid_into_mnt(mnt_userns, inode); + stat->gid = i_gid_into_mnt(mnt_userns, inode); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode->i_atime; @@ -67,6 +75,7 @@ EXPORT_SYMBOL(generic_fillattr); int vfs_getattr_nosec(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { + struct user_namespace *mnt_userns; struct inode *inode = d_backing_inode(path->dentry); memset(stat, 0, sizeof(*stat)); @@ -83,11 +92,12 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, if (IS_DAX(inode)) stat->attributes |= STATX_ATTR_DAX; + mnt_userns = mnt_user_ns(path->mnt); if (inode->i_op->getattr) - return inode->i_op->getattr(path, stat, request_mask, - query_flags); + return inode->i_op->getattr(mnt_userns, path, stat, + request_mask, query_flags); - generic_fillattr(inode, stat); + generic_fillattr(mnt_userns, inode, stat); return 0; } EXPORT_SYMBOL(vfs_getattr_nosec); diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 45fc79a18594..90e00124ea07 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -29,12 +29,13 @@ const struct file_operations sysv_file_operations = { .splice_read = generic_file_splice_read, }; -static int sysv_setattr(struct dentry *dentry, struct iattr *attr) +static int sysv_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -47,7 +48,7 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr) sysv_truncate(inode); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index 6c9801986af6..50df794a3c1f 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -163,7 +163,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); dirty_sb(sb); - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_ino = fs16_to_cpu(sbi, ino); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index bcb67b0cabe7..8b2e99b7bc9f 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -441,11 +441,11 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) return blocks; } -int sysv_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int sysv_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct super_block *s = path->dentry->d_sb; - generic_fillattr(d_inode(path->dentry), stat); + generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); stat->blksize = s->s_blocksize; return 0; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index ea2414b385ec..b2e6abc06a2d 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -41,7 +41,8 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, un return d_splice_alias(inode, dentry); } -static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, dev_t rdev) +static int sysv_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; int err; @@ -60,13 +61,14 @@ static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, return err; } -static int sysv_create(struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) +static int sysv_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - return sysv_mknod(dir, dentry, mode, 0); + return sysv_mknod(&init_user_ns, dir, dentry, mode, 0); } -static int sysv_symlink(struct inode * dir, struct dentry * dentry, - const char * symname) +static int sysv_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { int err = -ENAMETOOLONG; int l = strlen(symname)+1; @@ -108,7 +110,8 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int sysv_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode) +static int sysv_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode * inode; int err; @@ -186,9 +189,9 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry) * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. */ -static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, - struct inode * new_dir, struct dentry * new_dentry, - unsigned int flags) +static int sysv_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 1cff585526b1..99ddf033da4f 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -141,7 +141,8 @@ extern struct inode *sysv_iget(struct super_block *, unsigned int); extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); extern int sysv_sync_inode(struct inode *); extern void sysv_set_inode(struct inode *, dev_t); -extern int sysv_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int sysv_getattr(struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); extern int sysv_init_icache(void); extern void sysv_destroy_icache(void); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 0ee8c6dfb036..4b83cbded559 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -67,7 +67,9 @@ static char *get_dname(struct dentry *dentry) return name; } -static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode) +static int tracefs_syscall_mkdir(struct user_namespace *mnt_userns, + struct inode *inode, struct dentry *dentry, + umode_t mode) { char *name; int ret; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 9a6b8660425a..d9d8d7794eff 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -94,7 +94,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, */ inode->i_flags |= S_NOCMTIME; - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_mapping->nrpages = 0; @@ -280,8 +280,8 @@ static int ubifs_prepare_create(struct inode *dir, struct dentry *dentry, return fscrypt_setup_filename(dir, &dentry->d_name, 0, nm); } -static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int ubifs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; @@ -441,8 +441,8 @@ out_budg: return err; } -static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode) +static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { return do_tmpfile(dir, dentry, mode, NULL); } @@ -942,7 +942,8 @@ out_fname: return err; } -static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct ubifs_inode *dir_ui = ubifs_inode(dir); @@ -1013,8 +1014,8 @@ out_budg: return err; } -static int ubifs_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int ubifs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; struct ubifs_inode *ui; @@ -1102,8 +1103,8 @@ out_budg: return err; } -static int ubifs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int ubifs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct inode *inode; struct ubifs_inode *ui; @@ -1542,7 +1543,8 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, return err; } -static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, +static int ubifs_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -1566,8 +1568,8 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } -int ubifs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { loff_t size; struct inode *inode = d_inode(path->dentry); @@ -1589,7 +1591,7 @@ int ubifs_getattr(const struct path *path, struct kstat *stat, STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->blksize = UBIFS_BLOCK_SIZE; stat->size = ui->ui_size; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 2bc7780d2963..0e4b4be3aa26 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1257,7 +1257,8 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, return err; } -int ubifs_setattr(struct dentry *dentry, struct iattr *attr) +int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { int err; struct inode *inode = d_inode(dentry); @@ -1265,7 +1266,7 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) dbg_gen("ino %lu, mode %#x, ia_valid %#x", inode->i_ino, inode->i_mode, attr->ia_valid); - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) return err; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 4363d85a3fd4..2326d5122beb 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -155,7 +155,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (IS_RDONLY(inode)) return -EROFS; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index fc2cdde3b549..7fdfdbda4b8a 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1989,13 +1989,14 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc); /* file.c */ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync); -int ubifs_setattr(struct dentry *dentry, struct iattr *attr); +int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode); -int ubifs_getattr(const struct path *path, struct kstat *stat, +int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode *dir); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 842d5f14545d..6b1e9830b274 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -681,6 +681,7 @@ static int xattr_get(const struct xattr_handler *handler, } static int xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/udf/file.c b/fs/udf/file.c index ad8eefad27d7..2846dcd92197 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -183,7 +183,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) long old_block, new_block; int result; - if (inode_permission(inode, MAY_READ) != 0) { + if (file_permission(filp, MAY_READ) != 0) { udf_debug("no permission to access inode %lu\n", inode->i_ino); return -EPERM; } @@ -253,13 +253,14 @@ const struct file_operations udf_file_operations = { .llseek = generic_file_llseek, }; -static int udf_setattr(struct dentry *dentry, struct iattr *attr) +static int udf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -282,7 +283,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) udf_update_extra_perms(inode, attr->ia_mode); - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 84ed23edebfd..2ecf0e87660e 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -103,7 +103,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) mutex_unlock(&sbi->s_alloc_mutex); } - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) inode->i_uid = sbi->s_uid; if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) diff --git a/fs/udf/namei.c b/fs/udf/namei.c index e169d8fe35b5..f146b3089f3d 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -604,8 +604,8 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) return 0; } -static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode = udf_new_inode(dir, mode); @@ -623,7 +623,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, return udf_add_nondir(dentry, inode); } -static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode = udf_new_inode(dir, mode); @@ -642,8 +643,8 @@ static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; } -static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) +static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -658,7 +659,8 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, return udf_add_nondir(dentry, inode); } -static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int udf_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct udf_fileident_bh fibh; @@ -877,8 +879,8 @@ out: return retval; } -static int udf_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int udf_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777); struct pathComponent *pc; @@ -1065,9 +1067,9 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, /* Anybody can rename anything with this: the permission checks are left to the * higher-level routines. */ -static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index c973db239604..9b223421a3c5 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -152,14 +152,15 @@ out_unmap: return err; } -static int udf_symlink_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +static int udf_symlink_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; struct inode *inode = d_backing_inode(dentry); struct page *page; - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); page = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(page)) return PTR_ERR(page); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 969fd60436d3..7e3e08c0166f 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -289,7 +289,7 @@ cg_found: ufs_mark_sb_dirty(sb); inode->i_ino = cg * uspi->s_ipg + bit; - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_blocks = 0; inode->i_generation = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index c843ec858cf7..debc282c1bb4 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1211,13 +1211,14 @@ out: return err; } -int ufs_setattr(struct dentry *dentry, struct iattr *attr) +int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -1227,7 +1228,7 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr) return error; } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 9ef40f100415..29d5a0e0c8f0 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -69,7 +69,8 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsi * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode, +static int ufs_create (struct user_namespace * mnt_userns, + struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { struct inode *inode; @@ -85,7 +86,8 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode, return ufs_add_nondir(dentry, inode); } -static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +static int ufs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; int err; @@ -104,8 +106,8 @@ static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev return err; } -static int ufs_symlink (struct inode * dir, struct dentry * dentry, - const char * symname) +static int ufs_symlink (struct user_namespace * mnt_userns, struct inode * dir, + struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; int err; @@ -164,7 +166,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return error; } -static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) +static int ufs_mkdir(struct user_namespace * mnt_userns, struct inode * dir, + struct dentry * dentry, umode_t mode) { struct inode * inode; int err; @@ -240,9 +243,9 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) return err; } -static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int ufs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index b49e0efdf3d7..550f7c5a3636 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -123,7 +123,8 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long); extern int ufs_write_inode (struct inode *, struct writeback_control *); extern int ufs_sync_inode (struct inode *); extern void ufs_evict_inode (struct inode *); -extern int ufs_setattr(struct dentry *dentry, struct iattr *attr); +extern int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); /* namei.c */ extern const struct file_operations ufs_dir_operations; diff --git a/fs/utimes.c b/fs/utimes.c index fd3cc4226224..39f356017635 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -62,7 +62,8 @@ int vfs_utimes(const struct path *path, struct timespec64 *times) } retry_deleg: inode_lock(inode); - error = notify_change(path->dentry, &newattrs, &delegated_inode); + error = notify_change(mnt_user_ns(path->mnt), path->dentry, &newattrs, + &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index 4d569f14a8d8..7aee0ec63ade 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -288,13 +288,15 @@ static int vboxsf_dir_create(struct inode *parent, struct dentry *dentry, return 0; } -static int vboxsf_dir_mkfile(struct inode *parent, struct dentry *dentry, +static int vboxsf_dir_mkfile(struct user_namespace *mnt_userns, + struct inode *parent, struct dentry *dentry, umode_t mode, bool excl) { return vboxsf_dir_create(parent, dentry, mode, 0); } -static int vboxsf_dir_mkdir(struct inode *parent, struct dentry *dentry, +static int vboxsf_dir_mkdir(struct user_namespace *mnt_userns, + struct inode *parent, struct dentry *dentry, umode_t mode) { return vboxsf_dir_create(parent, dentry, mode, 1); @@ -332,7 +334,8 @@ static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry) return 0; } -static int vboxsf_dir_rename(struct inode *old_parent, +static int vboxsf_dir_rename(struct user_namespace *mnt_userns, + struct inode *old_parent, struct dentry *old_dentry, struct inode *new_parent, struct dentry *new_dentry, @@ -374,7 +377,8 @@ err_put_old_path: return err; } -static int vboxsf_dir_symlink(struct inode *parent, struct dentry *dentry, +static int vboxsf_dir_symlink(struct user_namespace *mnt_userns, + struct inode *parent, struct dentry *dentry, const char *symname) { struct vboxsf_inode *sf_parent_i = VBOXSF_I(parent); diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index 018057546067..3b847e3fba24 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -212,8 +212,8 @@ int vboxsf_inode_revalidate(struct dentry *dentry) return 0; } -int vboxsf_getattr(const struct path *path, struct kstat *kstat, - u32 request_mask, unsigned int flags) +int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *kstat, u32 request_mask, unsigned int flags) { int err; struct dentry *dentry = path->dentry; @@ -233,11 +233,12 @@ int vboxsf_getattr(const struct path *path, struct kstat *kstat, if (err) return err; - generic_fillattr(d_inode(dentry), kstat); + generic_fillattr(&init_user_ns, d_inode(dentry), kstat); return 0; } -int vboxsf_setattr(struct dentry *dentry, struct iattr *iattr) +int vboxsf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct vboxsf_inode *sf_i = VBOXSF_I(d_inode(dentry)); struct vboxsf_sbi *sbi = VBOXSF_SBI(dentry->d_sb); diff --git a/fs/vboxsf/vfsmod.h b/fs/vboxsf/vfsmod.h index 18f95b00fc33..760524e78c88 100644 --- a/fs/vboxsf/vfsmod.h +++ b/fs/vboxsf/vfsmod.h @@ -90,9 +90,11 @@ int vboxsf_stat(struct vboxsf_sbi *sbi, struct shfl_string *path, struct shfl_fsobjinfo *info); int vboxsf_stat_dentry(struct dentry *dentry, struct shfl_fsobjinfo *info); int vboxsf_inode_revalidate(struct dentry *dentry); -int vboxsf_getattr(const struct path *path, struct kstat *kstat, - u32 request_mask, unsigned int query_flags); -int vboxsf_setattr(struct dentry *dentry, struct iattr *iattr); +int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *kstat, u32 request_mask, + unsigned int query_flags); +int vboxsf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr); struct shfl_string *vboxsf_path_from_dentry(struct vboxsf_sbi *sbi, struct dentry *dentry); int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len, diff --git a/fs/verity/enable.c b/fs/verity/enable.c index f7e997a01ad0..77e159a0346b 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -369,7 +369,7 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) * has verity enabled, and to stabilize the data being hashed. */ - err = inode_permission(inode, MAY_WRITE); + err = file_permission(filp, MAY_WRITE); if (err) return err; diff --git a/fs/xattr.c b/fs/xattr.c index fd57153b1f61..b3444e06cded 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -83,7 +83,8 @@ xattr_resolve_name(struct inode *inode, const char **name) * because different namespaces have very different rules. */ static int -xattr_permission(struct inode *inode, const char *name, int mask) +xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, + const char *name, int mask) { /* * We can never set or remove an extended attribute on a read-only @@ -97,7 +98,7 @@ xattr_permission(struct inode *inode, const char *name, int mask) * to be writen back improperly if their true value is * unknown to the vfs. */ - if (HAS_UNMAPPED_ID(inode)) + if (HAS_UNMAPPED_ID(mnt_userns, inode)) return -EPERM; } @@ -127,11 +128,12 @@ xattr_permission(struct inode *inode, const char *name, int mask) if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && - (mask & MAY_WRITE) && !inode_owner_or_capable(inode)) + (mask & MAY_WRITE) && + !inode_owner_or_capable(mnt_userns, inode)) return -EPERM; } - return inode_permission(inode, mask); + return inode_permission(mnt_userns, inode, mask); } /* @@ -162,8 +164,9 @@ xattr_supported_namespace(struct inode *inode, const char *prefix) EXPORT_SYMBOL(xattr_supported_namespace); int -__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, - const void *value, size_t size, int flags) +__vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct inode *inode, const char *name, const void *value, + size_t size, int flags) { const struct xattr_handler *handler; @@ -174,7 +177,8 @@ __vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, return -EOPNOTSUPP; if (size == 0) value = ""; /* empty EA, do not remove */ - return handler->set(handler, dentry, inode, name, value, size, flags); + return handler->set(handler, mnt_userns, dentry, inode, name, value, + size, flags); } EXPORT_SYMBOL(__vfs_setxattr); @@ -182,6 +186,7 @@ EXPORT_SYMBOL(__vfs_setxattr); * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. * + * @mnt_userns - user namespace of the mount the inode was found from * @dentry - object to perform setxattr on * @name - xattr name to set * @value - value to set @name to @@ -194,8 +199,9 @@ EXPORT_SYMBOL(__vfs_setxattr); * is executed. It also assumes that the caller will make the appropriate * permission checks. */ -int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; int error = -EAGAIN; @@ -205,7 +211,8 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_opflags & IOP_XATTR) { - error = __vfs_setxattr(dentry, inode, name, value, size, flags); + error = __vfs_setxattr(mnt_userns, dentry, inode, name, value, + size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, @@ -244,18 +251,19 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, * a delegation was broken on, NULL if none. */ int -__vfs_setxattr_locked(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, - struct inode **delegated_inode) +__vfs_setxattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *name, const void *value, size_t size, + int flags, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(inode, name, MAY_WRITE); + error = xattr_permission(mnt_userns, inode, name, MAY_WRITE); if (error) return error; - error = security_inode_setxattr(dentry, name, value, size, flags); + error = security_inode_setxattr(mnt_userns, dentry, name, value, size, + flags); if (error) goto out; @@ -263,7 +271,8 @@ __vfs_setxattr_locked(struct dentry *dentry, const char *name, if (error) goto out; - error = __vfs_setxattr_noperm(dentry, name, value, size, flags); + error = __vfs_setxattr_noperm(mnt_userns, dentry, name, value, + size, flags); out: return error; @@ -271,8 +280,8 @@ out: EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); int -vfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) +vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; @@ -280,7 +289,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(dentry, &value, size); + error = cap_convert_nscap(mnt_userns, dentry, &value, size); if (error < 0) return error; size = error; @@ -288,8 +297,8 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, retry_deleg: inode_lock(inode); - error = __vfs_setxattr_locked(dentry, name, value, size, flags, - &delegated_inode); + error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, + flags, &delegated_inode); inode_unlock(inode); if (delegated_inode) { @@ -305,18 +314,20 @@ retry_deleg: EXPORT_SYMBOL_GPL(vfs_setxattr); static ssize_t -xattr_getsecurity(struct inode *inode, const char *name, void *value, - size_t size) +xattr_getsecurity(struct user_namespace *mnt_userns, struct inode *inode, + const char *name, void *value, size_t size) { void *buffer = NULL; ssize_t len; if (!value || !size) { - len = security_inode_getsecurity(inode, name, &buffer, false); + len = security_inode_getsecurity(mnt_userns, inode, name, + &buffer, false); goto out_noalloc; } - len = security_inode_getsecurity(inode, name, &buffer, true); + len = security_inode_getsecurity(mnt_userns, inode, name, &buffer, + true); if (len < 0) return len; if (size < len) { @@ -339,15 +350,16 @@ out_noalloc: * Returns the result of alloc, if failed, or the getxattr operation. */ ssize_t -vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, - size_t xattr_size, gfp_t flags) +vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *name, char **xattr_value, size_t xattr_size, + gfp_t flags) { const struct xattr_handler *handler; struct inode *inode = dentry->d_inode; char *value = *xattr_value; int error; - error = xattr_permission(inode, name, MAY_READ); + error = xattr_permission(mnt_userns, inode, name, MAY_READ); if (error) return error; @@ -388,12 +400,13 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, EXPORT_SYMBOL(__vfs_getxattr); ssize_t -vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) +vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(inode, name, MAY_READ); + error = xattr_permission(mnt_userns, inode, name, MAY_READ); if (error) return error; @@ -404,7 +417,8 @@ vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - int ret = xattr_getsecurity(inode, suffix, value, size); + int ret = xattr_getsecurity(mnt_userns, inode, suffix, value, + size); /* * Only overwrite the return value if a security module * is actually active. @@ -439,7 +453,8 @@ vfs_listxattr(struct dentry *dentry, char *list, size_t size) EXPORT_SYMBOL_GPL(vfs_listxattr); int -__vfs_removexattr(struct dentry *dentry, const char *name) +__vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *name) { struct inode *inode = d_inode(dentry); const struct xattr_handler *handler; @@ -449,7 +464,8 @@ __vfs_removexattr(struct dentry *dentry, const char *name) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; - return handler->set(handler, dentry, inode, name, NULL, 0, XATTR_REPLACE); + return handler->set(handler, mnt_userns, dentry, inode, name, NULL, 0, + XATTR_REPLACE); } EXPORT_SYMBOL(__vfs_removexattr); @@ -463,17 +479,18 @@ EXPORT_SYMBOL(__vfs_removexattr); * a delegation was broken on, NULL if none. */ int -__vfs_removexattr_locked(struct dentry *dentry, const char *name, - struct inode **delegated_inode) +__vfs_removexattr_locked(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *name, + struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(inode, name, MAY_WRITE); + error = xattr_permission(mnt_userns, inode, name, MAY_WRITE); if (error) return error; - error = security_inode_removexattr(dentry, name); + error = security_inode_removexattr(mnt_userns, dentry, name); if (error) goto out; @@ -481,7 +498,7 @@ __vfs_removexattr_locked(struct dentry *dentry, const char *name, if (error) goto out; - error = __vfs_removexattr(dentry, name); + error = __vfs_removexattr(mnt_userns, dentry, name); if (!error) { fsnotify_xattr(dentry); @@ -494,7 +511,8 @@ out: EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); int -vfs_removexattr(struct dentry *dentry, const char *name) +vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *name) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; @@ -502,7 +520,8 @@ vfs_removexattr(struct dentry *dentry, const char *name) retry_deleg: inode_lock(inode); - error = __vfs_removexattr_locked(dentry, name, &delegated_inode); + error = __vfs_removexattr_locked(mnt_userns, dentry, + name, &delegated_inode); inode_unlock(inode); if (delegated_inode) { @@ -519,8 +538,9 @@ EXPORT_SYMBOL_GPL(vfs_removexattr); * Extended attribute SET operations */ static long -setxattr(struct dentry *d, const char __user *name, const void __user *value, - size_t size, int flags) +setxattr(struct user_namespace *mnt_userns, struct dentry *d, + const char __user *name, const void __user *value, size_t size, + int flags) { int error; void *kvalue = NULL; @@ -547,10 +567,10 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, } if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_from_user(kvalue, size); + posix_acl_fix_xattr_from_user(mnt_userns, kvalue, size); } - error = vfs_setxattr(d, kname, kvalue, size, flags); + error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags); out: kvfree(kvalue); @@ -563,13 +583,15 @@ static int path_setxattr(const char __user *pathname, { struct path path; int error; + retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = mnt_want_write(path.mnt); if (!error) { - error = setxattr(path.dentry, name, value, size, flags); + error = setxattr(mnt_user_ns(path.mnt), path.dentry, name, + value, size, flags); mnt_drop_write(path.mnt); } path_put(&path); @@ -605,7 +627,9 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { - error = setxattr(f.file->f_path.dentry, name, value, size, flags); + error = setxattr(file_mnt_user_ns(f.file), + f.file->f_path.dentry, name, + value, size, flags); mnt_drop_write_file(f.file); } fdput(f); @@ -616,8 +640,8 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, * Extended attribute GET operations */ static ssize_t -getxattr(struct dentry *d, const char __user *name, void __user *value, - size_t size) +getxattr(struct user_namespace *mnt_userns, struct dentry *d, + const char __user *name, void __user *value, size_t size) { ssize_t error; void *kvalue = NULL; @@ -637,11 +661,11 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, return -ENOMEM; } - error = vfs_getxattr(d, kname, kvalue, size); + error = vfs_getxattr(mnt_userns, d, kname, kvalue, size); if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_to_user(kvalue, error); + posix_acl_fix_xattr_to_user(mnt_userns, kvalue, error); if (size && copy_to_user(value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { @@ -665,7 +689,7 @@ retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; - error = getxattr(path.dentry, name, value, size); + error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -695,7 +719,8 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, if (!f.file) return error; audit_file(f.file); - error = getxattr(f.file->f_path.dentry, name, value, size); + error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry, + name, value, size); fdput(f); return error; } @@ -779,7 +804,8 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) * Extended attribute REMOVE operations */ static long -removexattr(struct dentry *d, const char __user *name) +removexattr(struct user_namespace *mnt_userns, struct dentry *d, + const char __user *name) { int error; char kname[XATTR_NAME_MAX + 1]; @@ -790,7 +816,7 @@ removexattr(struct dentry *d, const char __user *name) if (error < 0) return error; - return vfs_removexattr(d, kname); + return vfs_removexattr(mnt_userns, d, kname); } static int path_removexattr(const char __user *pathname, @@ -804,7 +830,7 @@ retry: return error; error = mnt_want_write(path.mnt); if (!error) { - error = removexattr(path.dentry, name); + error = removexattr(mnt_user_ns(path.mnt), path.dentry, name); mnt_drop_write(path.mnt); } path_put(&path); @@ -837,7 +863,8 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { - error = removexattr(f.file->f_path.dentry, name); + error = removexattr(file_mnt_user_ns(f.file), + f.file->f_path.dentry, name); mnt_drop_write_file(f.file); } fdput(f); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index b56ff451adce..5b6fcb9b44e2 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2805,7 +2805,7 @@ xfs_btree_split_worker( struct xfs_btree_split_args *args = container_of(work, struct xfs_btree_split_args, work); unsigned long pflags; - unsigned long new_pflags = PF_MEMALLOC_NOFS; + unsigned long new_pflags = 0; /* * we are in a transaction context here, but may also be doing work @@ -2817,12 +2817,20 @@ xfs_btree_split_worker( new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; current_set_flags_nested(&pflags, new_pflags); + xfs_trans_set_context(args->cur->bc_tp); args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, args->key, args->curp, args->stat); - complete(args->done); + xfs_trans_clear_context(args->cur->bc_tp); current_restore_flags_nested(&pflags, new_pflags); + + /* + * Do not access args after complete() has run here. We don't own args + * and the owner may run and free args before we return here. + */ + complete(args->done); + } /* diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 779cb73b3d00..d02bef24b32b 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -238,7 +238,8 @@ xfs_acl_set_mode( } int -xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { umode_t mode; bool set_mode = false; @@ -252,7 +253,7 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; if (type == ACL_TYPE_ACCESS) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); if (error) return error; set_mode = true; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index c042c0868016..7bdb3a4ed798 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -11,7 +11,8 @@ struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); -extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); #else diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4304c6416fbb..b4186d666157 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -62,7 +62,7 @@ xfs_setfilesize_trans_alloc( * We hand off the transaction to the completion thread now, so * clear the flag here. */ - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); + xfs_trans_clear_context(tp); return 0; } @@ -125,7 +125,7 @@ xfs_setfilesize_ioend( * thus we need to mark ourselves as being in a transaction manually. * Similarly for freeze protection. */ - current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); + xfs_trans_set_context(tp); __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ @@ -568,6 +568,12 @@ xfs_vm_writepage( { struct xfs_writepage_ctx wpc = { }; + if (WARN_ON_ONCE(current->journal_info)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops); } @@ -578,6 +584,13 @@ xfs_vm_writepages( { struct xfs_writepage_ctx wpc = { }; + /* + * Writing back data in a transaction context can result in recursive + * transactions. This is bad, so issue a warning and get out of here. + */ + if (WARN_ON_ONCE(current->journal_info)) + return 0; + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index e2148f2d5d6b..17f36db2f792 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -6,7 +6,7 @@ static inline unsigned int bio_max_vecs(unsigned int count) { - return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES); + return bio_max_segs(howmany(count, PAGE_SIZE)); } int diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f6e5235df7c9..37a1d12762d8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1480,7 +1480,7 @@ xfs_buf_ioapply_map( int op) { int page_index; - int total_nr_pages = bp->b_page_count; + unsigned int total_nr_pages = bp->b_page_count; int nr_pages; struct bio *bio; sector_t sector = bp->b_maps[map].bm_bn; @@ -1505,7 +1505,7 @@ xfs_buf_ioapply_map( next_chunk: atomic_inc(&bp->b_io_remaining); - nr_pages = min(total_nr_pages, BIO_MAX_PAGES); + nr_pages = bio_max_segs(total_nr_pages); bio = bio_alloc(GFP_NOIO, nr_pages); bio_set_dev(bio, bp->b_target->bt_bdev); diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 3991e59cfd18..ef17c1f6db32 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -344,7 +344,6 @@ xfs_extent_busy_trim( ASSERT(*len > 0); spin_lock(&args->pag->pagb_lock); -restart: fbno = *bno; flen = *len; rbp = args->pag->pagb_tree.rb_node; @@ -363,19 +362,6 @@ restart: continue; } - /* - * If this is a metadata allocation, try to reuse the busy - * extent instead of trimming the allocation. - */ - if (!(args->datatype & XFS_ALLOC_USERDATA) && - !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { - if (!xfs_extent_busy_update_extent(args->mp, args->pag, - busyp, fbno, flen, - false)) - goto restart; - continue; - } - if (bbno <= fbno) { /* start overlap */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 68ca1b40d8c7..a007ca0711d9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -29,6 +29,7 @@ #include <linux/backing-dev.h> #include <linux/mman.h> #include <linux/fadvise.h> +#include <linux/mount.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -1051,7 +1052,8 @@ xfs_file_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; - error = xfs_vn_setattr_size(file_dentry(file), &iattr); + error = xfs_vn_setattr_size(file_mnt_user_ns(file), + file_dentry(file), &iattr); if (error) goto out_unlock; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 636ac13b1df2..46a861d55e48 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -766,6 +766,7 @@ xfs_inode_inherit_flags2( */ static int xfs_init_new_inode( + struct user_namespace *mnt_userns, struct xfs_trans *tp, struct xfs_inode *pip, xfs_ino_t ino, @@ -811,11 +812,11 @@ xfs_init_new_inode( if (dir && !(dir->i_mode & S_ISGID) && (mp->m_flags & XFS_MOUNT_GRPID)) { - inode->i_uid = current_fsuid(); + inode->i_uid = fsuid_into_mnt(mnt_userns); inode->i_gid = dir->i_gid; inode->i_mode = mode; } else { - inode_init_owner(inode, dir, mode); + inode_init_owner(mnt_userns, inode, dir, mode); } /* @@ -824,7 +825,8 @@ xfs_init_new_inode( * (and only if the irix_sgid_inherit compatibility variable is set). */ if (irix_sgid_inherit && - (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid)) + (inode->i_mode & S_ISGID) && + !in_group_p(i_gid_into_mnt(mnt_userns, inode))) inode->i_mode &= ~S_ISGID; ip->i_d.di_size = 0; @@ -901,6 +903,7 @@ xfs_init_new_inode( */ int xfs_dir_ialloc( + struct user_namespace *mnt_userns, struct xfs_trans **tpp, struct xfs_inode *dp, umode_t mode, @@ -933,7 +936,8 @@ xfs_dir_ialloc( return error; ASSERT(ino != NULLFSINO); - return xfs_init_new_inode(*tpp, dp, ino, mode, nlink, rdev, prid, ipp); + return xfs_init_new_inode(mnt_userns, *tpp, dp, ino, mode, nlink, rdev, + prid, ipp); } /* @@ -973,6 +977,7 @@ xfs_bumplink( int xfs_create( + struct user_namespace *mnt_userns, xfs_inode_t *dp, struct xfs_name *name, umode_t mode, @@ -1046,7 +1051,8 @@ xfs_create( * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip); + error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, is_dir ? 2 : 1, rdev, + prid, &ip); if (error) goto out_trans_cancel; @@ -1127,6 +1133,7 @@ xfs_create( int xfs_create_tmpfile( + struct user_namespace *mnt_userns, struct xfs_inode *dp, umode_t mode, struct xfs_inode **ipp) @@ -1164,7 +1171,7 @@ xfs_create_tmpfile( if (error) goto out_release_dquots; - error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip); + error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, 0, 0, prid, &ip); if (error) goto out_trans_cancel; @@ -2977,13 +2984,15 @@ out_trans_abort: */ static int xfs_rename_alloc_whiteout( + struct user_namespace *mnt_userns, struct xfs_inode *dp, struct xfs_inode **wip) { struct xfs_inode *tmpfile; int error; - error = xfs_create_tmpfile(dp, S_IFCHR | WHITEOUT_MODE, &tmpfile); + error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE, + &tmpfile); if (error) return error; @@ -3005,6 +3014,7 @@ xfs_rename_alloc_whiteout( */ int xfs_rename( + struct user_namespace *mnt_userns, struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, @@ -3036,7 +3046,7 @@ xfs_rename( */ if (flags & RENAME_WHITEOUT) { ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE))); - error = xfs_rename_alloc_whiteout(target_dp, &wip); + error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip); if (error) return error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index eca333f5f715..88ee4c3930ae 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -369,15 +369,18 @@ int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); -int xfs_create(struct xfs_inode *dp, struct xfs_name *name, +int xfs_create(struct user_namespace *mnt_userns, + struct xfs_inode *dp, struct xfs_name *name, umode_t mode, dev_t rdev, struct xfs_inode **ipp); -int xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode, +int xfs_create_tmpfile(struct user_namespace *mnt_userns, + struct xfs_inode *dp, umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, struct xfs_name *target_name); -int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, +int xfs_rename(struct user_namespace *mnt_userns, + struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, struct xfs_name *target_name, struct xfs_inode *target_ip, unsigned int flags); @@ -407,9 +410,10 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); -int xfs_dir_ialloc(struct xfs_trans **tpp, struct xfs_inode *dp, umode_t mode, - xfs_nlink_t nlink, dev_t dev, prid_t prid, - struct xfs_inode **ipp); +int xfs_dir_ialloc(struct user_namespace *mnt_userns, + struct xfs_trans **tpp, struct xfs_inode *dp, + umode_t mode, xfs_nlink_t nlink, dev_t dev, + prid_t prid, struct xfs_inode **ipp); static inline int xfs_itruncate_extents( diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 248083ea0276..99dfe89a8d08 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -693,7 +693,8 @@ xfs_ioc_space( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = bf->l_start; - error = xfs_vn_setattr_size(file_dentry(filp), &iattr); + error = xfs_vn_setattr_size(file_mnt_user_ns(filp), file_dentry(filp), + &iattr); if (error) goto out_unlock; @@ -734,13 +735,15 @@ xfs_fsinumbers_fmt( STATIC int xfs_ioc_fsbulkstat( - xfs_mount_t *mp, + struct file *file, unsigned int cmd, void __user *arg) { + struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; struct xfs_fsop_bulkreq bulkreq; struct xfs_ibulk breq = { .mp = mp, + .mnt_userns = file_mnt_user_ns(file), .ocount = 0, }; xfs_ino_t lastino; @@ -908,13 +911,15 @@ xfs_bulk_ireq_teardown( /* Handle the v5 bulkstat ioctl. */ STATIC int xfs_ioc_bulkstat( - struct xfs_mount *mp, + struct file *file, unsigned int cmd, struct xfs_bulkstat_req __user *arg) { + struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; struct xfs_bulk_ireq hdr; struct xfs_ibulk breq = { .mp = mp, + .mnt_userns = file_mnt_user_ns(file), }; int error; @@ -1275,9 +1280,10 @@ xfs_ioctl_setattr_prepare_dax( */ static struct xfs_trans * xfs_ioctl_setattr_get_trans( - struct xfs_inode *ip, + struct file *file, struct xfs_dquot *pdqp) { + struct xfs_inode *ip = XFS_I(file_inode(file)); struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error = -EROFS; @@ -1299,7 +1305,7 @@ xfs_ioctl_setattr_get_trans( * The user ID of the calling process must be equal to the file owner * ID, except in cases where the CAP_FSETID capability is applicable. */ - if (!inode_owner_or_capable(VFS_I(ip))) { + if (!inode_owner_or_capable(file_mnt_user_ns(file), VFS_I(ip))) { error = -EPERM; goto out_cancel; } @@ -1427,9 +1433,11 @@ xfs_ioctl_setattr_check_projid( STATIC int xfs_ioctl_setattr( - xfs_inode_t *ip, + struct file *file, struct fsxattr *fa) { + struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct xfs_inode *ip = XFS_I(file_inode(file)); struct fsxattr old_fa; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -1461,7 +1469,7 @@ xfs_ioctl_setattr( xfs_ioctl_setattr_prepare_dax(ip, fa); - tp = xfs_ioctl_setattr_get_trans(ip, pdqp); + tp = xfs_ioctl_setattr_get_trans(file, pdqp); if (IS_ERR(tp)) { error = PTR_ERR(tp); goto error_free_dquots; @@ -1493,7 +1501,7 @@ xfs_ioctl_setattr( */ if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) && - !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) + !capable_wrt_inode_uidgid(mnt_userns, VFS_I(ip), CAP_FSETID)) VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID); /* Change the ownerships and register project quota modifications */ @@ -1540,7 +1548,6 @@ error_free_dquots: STATIC int xfs_ioc_fssetxattr( - xfs_inode_t *ip, struct file *filp, void __user *arg) { @@ -1553,7 +1560,7 @@ xfs_ioc_fssetxattr( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_ioctl_setattr(ip, &fa); + error = xfs_ioctl_setattr(filp, &fa); mnt_drop_write_file(filp); return error; } @@ -1599,7 +1606,7 @@ xfs_ioc_setxflags( xfs_ioctl_setattr_prepare_dax(ip, &fa); - tp = xfs_ioctl_setattr_get_trans(ip, NULL); + tp = xfs_ioctl_setattr_get_trans(filp, NULL); if (IS_ERR(tp)) { error = PTR_ERR(tp); goto out_drop_write; @@ -2110,10 +2117,10 @@ xfs_file_ioctl( case XFS_IOC_FSBULKSTAT_SINGLE: case XFS_IOC_FSBULKSTAT: case XFS_IOC_FSINUMBERS: - return xfs_ioc_fsbulkstat(mp, cmd, arg); + return xfs_ioc_fsbulkstat(filp, cmd, arg); case XFS_IOC_BULKSTAT: - return xfs_ioc_bulkstat(mp, cmd, arg); + return xfs_ioc_bulkstat(filp, cmd, arg); case XFS_IOC_INUMBERS: return xfs_ioc_inumbers(mp, cmd, arg); @@ -2135,7 +2142,7 @@ xfs_file_ioctl( case XFS_IOC_FSGETXATTRA: return xfs_ioc_fsgetxattr(ip, 1, arg); case XFS_IOC_FSSETXATTR: - return xfs_ioc_fssetxattr(ip, filp, arg); + return xfs_ioc_fssetxattr(filp, arg); case XFS_IOC_GETXFLAGS: return xfs_ioc_getxflags(ip, arg); case XFS_IOC_SETXFLAGS: diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index c1771e728117..33c09ec8e6c0 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -209,14 +209,16 @@ xfs_fsbulkstat_one_fmt_compat( /* copied from xfs_ioctl.c */ STATIC int xfs_compat_ioc_fsbulkstat( - xfs_mount_t *mp, + struct file *file, unsigned int cmd, struct compat_xfs_fsop_bulkreq __user *p32) { + struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; u32 addr; struct xfs_fsop_bulkreq bulkreq; struct xfs_ibulk breq = { .mp = mp, + .mnt_userns = file_mnt_user_ns(file), .ocount = 0, }; xfs_ino_t lastino; @@ -436,7 +438,6 @@ xfs_file_compat_ioctl( { struct inode *inode = file_inode(filp); struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; void __user *arg = compat_ptr(p); int error; @@ -456,7 +457,7 @@ xfs_file_compat_ioctl( return xfs_ioc_space(filp, &bf); } case XFS_IOC_FSGEOMETRY_V1_32: - return xfs_compat_ioc_fsgeometry_v1(mp, arg); + return xfs_compat_ioc_fsgeometry_v1(ip->i_mount, arg); case XFS_IOC_FSGROWFSDATA_32: { struct xfs_growfs_data in; @@ -465,7 +466,7 @@ xfs_file_compat_ioctl( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_growfs_data(mp, &in); + error = xfs_growfs_data(ip->i_mount, &in); mnt_drop_write_file(filp); return error; } @@ -477,7 +478,7 @@ xfs_file_compat_ioctl( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_growfs_rt(mp, &in); + error = xfs_growfs_rt(ip->i_mount, &in); mnt_drop_write_file(filp); return error; } @@ -507,7 +508,7 @@ xfs_file_compat_ioctl( case XFS_IOC_FSBULKSTAT_32: case XFS_IOC_FSBULKSTAT_SINGLE_32: case XFS_IOC_FSINUMBERS_32: - return xfs_compat_ioc_fsbulkstat(mp, cmd, arg); + return xfs_compat_ioc_fsbulkstat(filp, cmd, arg); case XFS_IOC_FD_TO_HANDLE_32: case XFS_IOC_PATH_TO_HANDLE_32: case XFS_IOC_PATH_TO_FSHANDLE_32: { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 00369502fe25..66ebccb5a6ff 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -128,6 +128,7 @@ xfs_cleanup_inode( STATIC int xfs_generic_create( + struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, @@ -161,9 +162,10 @@ xfs_generic_create( goto out_free_acl; if (!tmpfile) { - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + error = xfs_create(mnt_userns, XFS_I(dir), &name, mode, rdev, + &ip); } else { - error = xfs_create_tmpfile(XFS_I(dir), mode, &ip); + error = xfs_create_tmpfile(mnt_userns, XFS_I(dir), mode, &ip); } if (unlikely(error)) goto out_free_acl; @@ -220,31 +222,35 @@ xfs_generic_create( STATIC int xfs_vn_mknod( - struct inode *dir, - struct dentry *dentry, - umode_t mode, - dev_t rdev) + struct user_namespace *mnt_userns, + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) { - return xfs_generic_create(dir, dentry, mode, rdev, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, false); } STATIC int xfs_vn_create( - struct inode *dir, - struct dentry *dentry, - umode_t mode, - bool flags) + struct user_namespace *mnt_userns, + struct inode *dir, + struct dentry *dentry, + umode_t mode, + bool flags) { - return xfs_generic_create(dir, dentry, mode, 0, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, false); } STATIC int xfs_vn_mkdir( - struct inode *dir, - struct dentry *dentry, - umode_t mode) + struct user_namespace *mnt_userns, + struct inode *dir, + struct dentry *dentry, + umode_t mode) { - return xfs_generic_create(dir, dentry, mode | S_IFDIR, 0, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0, + false); } STATIC struct dentry * @@ -361,9 +367,10 @@ xfs_vn_unlink( STATIC int xfs_vn_symlink( - struct inode *dir, - struct dentry *dentry, - const char *symname) + struct user_namespace *mnt_userns, + struct inode *dir, + struct dentry *dentry, + const char *symname) { struct inode *inode; struct xfs_inode *cip = NULL; @@ -377,7 +384,7 @@ xfs_vn_symlink( if (unlikely(error)) goto out; - error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); + error = xfs_symlink(mnt_userns, XFS_I(dir), &name, symname, mode, &cip); if (unlikely(error)) goto out; @@ -403,11 +410,12 @@ xfs_vn_symlink( STATIC int xfs_vn_rename( - struct inode *odir, - struct dentry *odentry, - struct inode *ndir, - struct dentry *ndentry, - unsigned int flags) + struct user_namespace *mnt_userns, + struct inode *odir, + struct dentry *odentry, + struct inode *ndir, + struct dentry *ndentry, + unsigned int flags) { struct inode *new_inode = d_inode(ndentry); int omode = 0; @@ -431,8 +439,8 @@ xfs_vn_rename( if (unlikely(error)) return error; - return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)), - XFS_I(ndir), &nname, + return xfs_rename(mnt_userns, XFS_I(odir), &oname, + XFS_I(d_inode(odentry)), XFS_I(ndir), &nname, new_inode ? XFS_I(new_inode) : NULL, flags); } @@ -529,6 +537,7 @@ xfs_stat_blksize( STATIC int xfs_vn_getattr( + struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, @@ -547,8 +556,8 @@ xfs_vn_getattr( stat->dev = inode->i_sb->s_dev; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; - stat->uid = inode->i_uid; - stat->gid = inode->i_gid; + stat->uid = i_uid_into_mnt(mnt_userns, inode); + stat->gid = i_gid_into_mnt(mnt_userns, inode); stat->ino = ip->i_ino; stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; @@ -626,8 +635,9 @@ xfs_setattr_time( static int xfs_vn_change_ok( - struct dentry *dentry, - struct iattr *iattr) + struct user_namespace *mnt_userns, + struct dentry *dentry, + struct iattr *iattr) { struct xfs_mount *mp = XFS_I(d_inode(dentry))->i_mount; @@ -637,7 +647,7 @@ xfs_vn_change_ok( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - return setattr_prepare(dentry, iattr); + return setattr_prepare(mnt_userns, dentry, iattr); } /* @@ -648,6 +658,7 @@ xfs_vn_change_ok( */ static int xfs_setattr_nonsize( + struct user_namespace *mnt_userns, struct xfs_inode *ip, struct iattr *iattr) { @@ -788,7 +799,7 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. */ if (mask & ATTR_MODE) { - error = posix_acl_chmod(inode, inode->i_mode); + error = posix_acl_chmod(mnt_userns, inode, inode->i_mode); if (error) return error; } @@ -809,6 +820,7 @@ out_dqrele: */ STATIC int xfs_setattr_size( + struct user_namespace *mnt_userns, struct xfs_inode *ip, struct iattr *iattr) { @@ -840,7 +852,7 @@ xfs_setattr_size( * Use the regular setattr path to update the timestamps. */ iattr->ia_valid &= ~ATTR_SIZE; - return xfs_setattr_nonsize(ip, iattr); + return xfs_setattr_nonsize(mnt_userns, ip, iattr); } /* @@ -1009,6 +1021,7 @@ out_trans_cancel: int xfs_vn_setattr_size( + struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *iattr) { @@ -1017,14 +1030,15 @@ xfs_vn_setattr_size( trace_xfs_setattr(ip); - error = xfs_vn_change_ok(dentry, iattr); + error = xfs_vn_change_ok(mnt_userns, dentry, iattr); if (error) return error; - return xfs_setattr_size(ip, iattr); + return xfs_setattr_size(mnt_userns, ip, iattr); } STATIC int xfs_vn_setattr( + struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *iattr) { @@ -1044,14 +1058,14 @@ xfs_vn_setattr( return error; } - error = xfs_vn_setattr_size(dentry, iattr); + error = xfs_vn_setattr_size(mnt_userns, dentry, iattr); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { trace_xfs_setattr(ip); - error = xfs_vn_change_ok(dentry, iattr); + error = xfs_vn_change_ok(mnt_userns, dentry, iattr); if (!error) - error = xfs_setattr_nonsize(ip, iattr); + error = xfs_setattr_nonsize(mnt_userns, ip, iattr); } return error; @@ -1122,11 +1136,12 @@ xfs_vn_fiemap( STATIC int xfs_vn_tmpfile( - struct inode *dir, - struct dentry *dentry, - umode_t mode) + struct user_namespace *mnt_userns, + struct inode *dir, + struct dentry *dentry, + umode_t mode) { - return xfs_generic_create(dir, dentry, mode, 0, true); + return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, true); } static const struct inode_operations xfs_inode_operations = { diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 99ca745c1071..278949056048 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -14,6 +14,7 @@ extern const struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); -extern int xfs_vn_setattr_size(struct dentry *dentry, struct iattr *vap); +int xfs_vn_setattr_size(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *vap); #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 16ca97a7ff00..ca310a125d1e 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -54,10 +54,12 @@ struct xfs_bstat_chunk { STATIC int xfs_bulkstat_one_int( struct xfs_mount *mp, + struct user_namespace *mnt_userns, struct xfs_trans *tp, xfs_ino_t ino, struct xfs_bstat_chunk *bc) { + struct user_namespace *sb_userns = mp->m_super->s_user_ns; struct xfs_icdinode *dic; /* dinode core info pointer */ struct xfs_inode *ip; /* incore inode pointer */ struct inode *inode; @@ -86,8 +88,8 @@ xfs_bulkstat_one_int( */ buf->bs_projectid = ip->i_d.di_projid; buf->bs_ino = ino; - buf->bs_uid = i_uid_read(inode); - buf->bs_gid = i_gid_read(inode); + buf->bs_uid = from_kuid(sb_userns, i_uid_into_mnt(mnt_userns, inode)); + buf->bs_gid = from_kgid(sb_userns, i_gid_into_mnt(mnt_userns, inode)); buf->bs_size = dic->di_size; buf->bs_nlink = inode->i_nlink; @@ -173,7 +175,8 @@ xfs_bulkstat_one( if (!bc.buf) return -ENOMEM; - error = xfs_bulkstat_one_int(breq->mp, NULL, breq->startino, &bc); + error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, NULL, + breq->startino, &bc); kmem_free(bc.buf); @@ -194,9 +197,10 @@ xfs_bulkstat_iwalk( xfs_ino_t ino, void *data) { + struct xfs_bstat_chunk *bc = data; int error; - error = xfs_bulkstat_one_int(mp, tp, ino, data); + error = xfs_bulkstat_one_int(mp, bc->breq->mnt_userns, tp, ino, data); /* bulkstat just skips over missing inodes */ if (error == -ENOENT || error == -EINVAL) return 0; @@ -239,6 +243,11 @@ xfs_bulkstat( }; int error; + if (breq->mnt_userns != &init_user_ns) { + xfs_warn_ratelimited(breq->mp, + "bulkstat not supported inside of idmapped mounts."); + return -EINVAL; + } if (xfs_bulkstat_already_done(breq->mp, breq->startino)) return 0; diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 96a1e2a9be3f..7078d10c9b12 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -8,6 +8,7 @@ /* In-memory representation of a userspace request for batch inode data. */ struct xfs_ibulk { struct xfs_mount *mp; + struct user_namespace *mnt_userns; void __user *ubuffer; /* user output buffer */ xfs_ino_t startino; /* start with this inode */ unsigned int icount; /* number of elements in ubuffer */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 742d1413e2d0..bfa4164990b1 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -787,7 +787,8 @@ xfs_qm_qino_alloc( return error; if (need_alloc) { - error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ipp); + error = xfs_dir_ialloc(&init_user_ns, &tp, NULL, S_IFREG, 1, 0, + 0, ipp); if (error) { xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 586d42342a79..e5e0713bebcd 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1881,7 +1881,7 @@ static struct file_system_type xfs_fs_type = { .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("xfs"); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 8565663b16cd..1379013d74b8 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -134,6 +134,7 @@ xfs_readlink( int xfs_symlink( + struct user_namespace *mnt_userns, struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, @@ -221,8 +222,8 @@ xfs_symlink( /* * Allocate an inode for the symlink. */ - error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, &ip); + error = xfs_dir_ialloc(mnt_userns, &tp, dp, S_IFLNK | (mode & ~S_IFMT), + 1, 0, prid, &ip); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index b1fa091427e6..2586b7e393f3 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -7,8 +7,9 @@ /* Kernel only symlink definitions */ -int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, - const char *target_path, umode_t mode, struct xfs_inode **ipp); +int xfs_symlink(struct user_namespace *mnt_userns, struct xfs_inode *dp, + struct xfs_name *link_name, const char *target_path, + umode_t mode, struct xfs_inode **ipp); int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link); int xfs_readlink(struct xfs_inode *ip, char *link); int xfs_inactive_symlink(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 145e06c47744..546a6cd96729 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -51,7 +51,7 @@ xfs_panic_mask_proc_handler( #endif /* CONFIG_PROC_FS */ STATIC int -xfs_deprecate_irix_sgid_inherit_proc_handler( +xfs_deprecated_dointvec_minmax( struct ctl_table *ctl, int write, void *buffer, @@ -59,24 +59,8 @@ xfs_deprecate_irix_sgid_inherit_proc_handler( loff_t *ppos) { if (write) { - printk_once(KERN_WARNING - "XFS: " "%s sysctl option is deprecated.\n", - ctl->procname); - } - return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); -} - -STATIC int -xfs_deprecate_irix_symlink_mode_proc_handler( - struct ctl_table *ctl, - int write, - void *buffer, - size_t *lenp, - loff_t *ppos) -{ - if (write) { - printk_once(KERN_WARNING - "XFS: " "%s sysctl option is deprecated.\n", + printk_ratelimited(KERN_WARNING + "XFS: %s sysctl option is deprecated.\n", ctl->procname); } return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); @@ -88,7 +72,7 @@ static struct ctl_table xfs_table[] = { .data = &xfs_params.sgid_inherit.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = xfs_deprecate_irix_sgid_inherit_proc_handler, + .proc_handler = xfs_deprecated_dointvec_minmax, .extra1 = &xfs_params.sgid_inherit.min, .extra2 = &xfs_params.sgid_inherit.max }, @@ -97,7 +81,7 @@ static struct ctl_table xfs_table[] = { .data = &xfs_params.symlink_mode.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = xfs_deprecate_irix_symlink_mode_proc_handler, + .proc_handler = xfs_deprecated_dointvec_minmax, .extra1 = &xfs_params.symlink_mode.min, .extra2 = &xfs_params.symlink_mode.max }, @@ -201,6 +185,15 @@ static struct ctl_table xfs_table[] = { .extra1 = &xfs_params.blockgc_timer.min, .extra2 = &xfs_params.blockgc_timer.max, }, + { + .procname = "speculative_cow_prealloc_lifetime", + .data = &xfs_params.blockgc_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = xfs_deprecated_dointvec_minmax, + .extra1 = &xfs_params.blockgc_timer.min, + .extra2 = &xfs_params.blockgc_timer.max, + }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 44f72c09c203..b22a09e9daee 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -72,6 +72,7 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); trace_xfs_trans_free(tp, _RET_IP_); + xfs_trans_clear_context(tp); if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); @@ -123,7 +124,8 @@ xfs_trans_dup( ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; tp->t_rtx_res = tp->t_rtx_res_used; - ntp->t_pflags = tp->t_pflags; + + xfs_trans_switch_context(tp, ntp); /* move deferred ops over to the new tp */ xfs_defer_move(ntp, tp); @@ -157,9 +159,6 @@ xfs_trans_reserve( int error = 0; bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; - /* Mark this thread as being in a transaction */ - current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); - /* * Attempt to reserve the needed disk blocks by decrementing * the number needed from the number available. This will @@ -167,10 +166,8 @@ xfs_trans_reserve( */ if (blocks > 0) { error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd); - if (error != 0) { - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); + if (error != 0) return -ENOSPC; - } tp->t_blk_res += blocks; } @@ -244,9 +241,6 @@ undo_blocks: xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } - - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); - return error; } @@ -260,6 +254,7 @@ xfs_trans_alloc( struct xfs_trans **tpp) { struct xfs_trans *tp; + bool want_retry = true; int error; /* @@ -267,9 +262,11 @@ xfs_trans_alloc( * GFP_NOFS allocation context so that we avoid lockdep false positives * by doing GFP_KERNEL allocations inside sb_start_intwrite(). */ +retry: tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL); if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); + xfs_trans_set_context(tp); /* * Zero-reservation ("empty") transactions can't modify anything, so @@ -289,7 +286,9 @@ xfs_trans_alloc( tp->t_firstblock = NULLFSBLOCK; error = xfs_trans_reserve(tp, resp, blocks, rtextents); - if (error == -ENOSPC) { + if (error == -ENOSPC && want_retry) { + xfs_trans_cancel(tp); + /* * We weren't able to reserve enough space for the transaction. * Flush the other speculative space allocations to free space. @@ -297,8 +296,11 @@ xfs_trans_alloc( * other locks. */ error = xfs_blockgc_free_space(mp, NULL); - if (!error) - error = xfs_trans_reserve(tp, resp, blocks, rtextents); + if (error) + return error; + + want_retry = false; + goto retry; } if (error) { xfs_trans_cancel(tp); @@ -893,7 +895,6 @@ __xfs_trans_commit( xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free(tp); /* @@ -925,7 +926,6 @@ out_unreserve: xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; } - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, !!error); xfs_trans_free(tp); @@ -985,9 +985,6 @@ xfs_trans_cancel( tp->t_ticket = NULL; } - /* mark this thread as no longer being in a transaction */ - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); - xfs_trans_free_items(tp, dirty); xfs_trans_free(tp); } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 8b03fbfe9a1b..9dd745cf77c9 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -281,4 +281,34 @@ int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force, struct xfs_trans **tpp); +static inline void +xfs_trans_set_context( + struct xfs_trans *tp) +{ + ASSERT(current->journal_info == NULL); + tp->t_pflags = memalloc_nofs_save(); + current->journal_info = tp; +} + +static inline void +xfs_trans_clear_context( + struct xfs_trans *tp) +{ + if (current->journal_info == tp) { + memalloc_nofs_restore(tp->t_pflags); + current->journal_info = NULL; + } +} + +static inline void +xfs_trans_switch_context( + struct xfs_trans *old_tp, + struct xfs_trans *new_tp) +{ + ASSERT(current->journal_info == old_tp); + new_tp->t_pflags = old_tp->t_pflags; + old_tp->t_pflags = 0; + current->journal_info = new_tp; +} + #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index bca48b308c02..12be32f66dc1 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -38,9 +38,10 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, } static int -xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, - struct inode *inode, const char *name, const void *value, - size_t size, int flags) +xfs_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, + struct inode *inode, const char *name, const void *value, + size_t size, int flags) { struct xfs_da_args args = { .dp = XFS_I(inode), diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index f3115432b43a..b6ff4a21abac 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -489,7 +489,8 @@ unlock: return ret; } -static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr) +static int zonefs_inode_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int ret; @@ -497,7 +498,7 @@ static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr) if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; - ret = setattr_prepare(dentry, iattr); + ret = setattr_prepare(&init_user_ns, dentry, iattr); if (ret) return ret; @@ -525,7 +526,7 @@ static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr) return ret; } - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); return 0; } @@ -1233,7 +1234,7 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, struct super_block *sb = parent->i_sb; inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1; - inode_init_owner(inode, parent, S_IFDIR | 0555); + inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555); inode->i_op = &zonefs_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); |