Diffstat (limited to 'fs')
272 files changed, 9777 insertions, 11282 deletions
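The first diff below changes the 9p client's stat-to-inode helpers to take a flags argument, so inode refresh paths can preserve a locally cached i_size instead of clobbering it with the server's possibly stale value, and adds a v9fs_i_size_write() wrapper that takes i_lock on 32-bit kernels, where unserialized i_size_write() calls can corrupt the size seqcount and make i_size_read() spin forever. A minimal sketch of the resulting calling convention, condensed from the v9fs_refresh_inode() hunk below (example_refresh is a placeholder name; the other identifiers appear in the diff):

	/* Sketch: refresh inode attributes but keep i_size while data is cached. */
	static void example_refresh(struct v9fs_session_info *v9ses,
				    struct p9_wstat *st, struct inode *inode)
	{
		unsigned int flags;

		/* Writeback caches may hold dirty data beyond the server's size. */
		flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
			V9FS_STAT2INODE_KEEP_ISIZE : 0;
		v9fs_stat2inode(st, inode, inode->i_sb, flags);
	}
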
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 5a0db6dec8d1..aaee1e6584e6 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -40,6 +40,9 @@ */ #define P9_LOCK_TIMEOUT (30*HZ) +/* flags for v9fs_stat2inode() & v9fs_stat2inode_dotl() */ +#define V9FS_STAT2INODE_KEEP_ISIZE 1 + extern struct file_system_type v9fs_fs_type; extern const struct address_space_operations v9fs_addr_operations; extern const struct file_operations v9fs_file_operations; @@ -61,8 +64,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); -void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); -void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *); +void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, + struct super_block *sb, unsigned int flags); +void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, + unsigned int flags); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); @@ -83,4 +88,18 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode) } int v9fs_open_to_dotl_flags(int flags); + +static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size) +{ + /* + * 32-bit need the lock, concurrent updates could break the + * sequences and make i_size_read() loop forever. + * 64-bit updates are atomic and can skip the locking. + */ + if (sizeof(i_size) > sizeof(long)) + spin_lock(&inode->i_lock); + i_size_write(inode, i_size); + if (sizeof(i_size) > sizeof(long)) + spin_unlock(&inode->i_lock); +} #endif diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index a25efa782fcc..9a1125305d84 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -446,7 +446,11 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) i_size = i_size_read(inode); if (iocb->ki_pos > i_size) { inode_add_bytes(inode, iocb->ki_pos - i_size); - i_size_write(inode, iocb->ki_pos); + /* + * Need to serialize against i_size_write() in + * v9fs_stat2inode() + */ + v9fs_i_size_write(inode, iocb->ki_pos); } return retval; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 85ff859d3af5..72b779bc0942 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -538,7 +538,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, if (retval) goto error; - v9fs_stat2inode(st, inode, sb); + v9fs_stat2inode(st, inode, sb, 0); v9fs_cache_inode_get_cookie(inode); unlock_new_inode(inode); return inode; @@ -1092,7 +1092,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb); + v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); generic_fillattr(d_inode(dentry), stat); p9stat_free(st); @@ -1170,12 +1170,13 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) * @stat: Plan 9 metadata (mistat) structure * @inode: inode to populate * @sb: superblock of filesystem + * @flags: control flags (e.g. 
V9FS_STAT2INODE_KEEP_ISIZE) * */ void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, - struct super_block *sb) + struct super_block *sb, unsigned int flags) { umode_t mode; char ext[32]; @@ -1216,10 +1217,11 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, mode = p9mode2perm(v9ses, stat); mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - i_size_write(inode, stat->length); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) + v9fs_i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ - inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; + inode->i_blocks = (stat->length + 512 - 1) >> 9; v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } @@ -1416,9 +1418,9 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) { int umode; dev_t rdev; - loff_t i_size; struct p9_wstat *st; struct v9fs_session_info *v9ses; + unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_stat(fid); @@ -1431,16 +1433,13 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) goto out; - spin_lock(&inode->i_lock); /* * We don't want to refresh inode->i_size, * because we may have cached data */ - i_size = inode->i_size; - v9fs_stat2inode(st, inode, inode->i_sb); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - inode->i_size = i_size; - spin_unlock(&inode->i_lock); + flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + V9FS_STAT2INODE_KEEP_ISIZE : 0; + v9fs_stat2inode(st, inode, inode->i_sb, flags); out: p9stat_free(st); kfree(st); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 4823e1c46999..a950a927a626 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -143,7 +143,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, if (retval) goto error; - v9fs_stat2inode_dotl(st, inode); + v9fs_stat2inode_dotl(st, inode, 0); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); if (retval) @@ -496,7 +496,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode_dotl(st, d_inode(dentry)); + v9fs_stat2inode_dotl(st, d_inode(dentry), 0); generic_fillattr(d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -607,11 +607,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) * v9fs_stat2inode_dotl - populate an inode structure with stat info * @stat: stat structure * @inode: inode to populate + * @flags: ctrl flags (e.g. 
V9FS_STAT2INODE_KEEP_ISIZE) * */ void -v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) +v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, + unsigned int flags) { umode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); @@ -631,7 +633,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - i_size_write(inode, stat->st_size); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) + v9fs_i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; } else { if (stat->st_result_mask & P9_STATS_ATIME) { @@ -661,8 +664,9 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) } if (stat->st_result_mask & P9_STATS_RDEV) inode->i_rdev = new_decode_dev(stat->st_rdev); - if (stat->st_result_mask & P9_STATS_SIZE) - i_size_write(inode, stat->st_size); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && + stat->st_result_mask & P9_STATS_SIZE) + v9fs_i_size_write(inode, stat->st_size); if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; } @@ -928,9 +932,9 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry, int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) { - loff_t i_size; struct p9_stat_dotl *st; struct v9fs_session_info *v9ses; + unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_getattr_dotl(fid, P9_STATS_ALL); @@ -942,16 +946,13 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) goto out; - spin_lock(&inode->i_lock); /* * We don't want to refresh inode->i_size, * because we may have cached data */ - i_size = inode->i_size; - v9fs_stat2inode_dotl(st, inode); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - inode->i_size = i_size; - spin_unlock(&inode->i_lock); + flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + V9FS_STAT2INODE_KEEP_ISIZE : 0; + v9fs_stat2inode_dotl(st, inode, flags); out: kfree(st); return 0; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 48ce50484e80..d13d35cf69c0 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -92,7 +92,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, return ret; if (v9ses->cache) - sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; sb->s_flags |= SB_ACTIVE | SB_DIRSYNC; if (!v9ses->cache) @@ -172,7 +172,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode_dotl(st, d_inode(root)); + v9fs_stat2inode_dotl(st, d_inode(root), 0); kfree(st); } else { struct p9_wstat *st = NULL; @@ -183,7 +183,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode(st, d_inode(root), sb); + v9fs_stat2inode(st, d_inode(root), sb, 0); p9stat_free(st); kfree(st); diff --git a/fs/Kconfig b/fs/Kconfig index ac474a61be37..3e6d3101f3ff 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -8,6 +8,13 @@ menu "File systems" config DCACHE_WORD_ACCESS bool +config VALIDATE_FS_PARSER + bool "Validate filesystem parameter description" + default y + help + Enable this to perform validation of the parameter description for a + filesystem when it is registered. 
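
For context, the parameter description this option validates is the spec table a filesystem registers alongside its file_system_type; the fs/afs conversion later in this diff is one concrete instance. A minimal sketch using the same helpers (the "example" names are placeholders; fsparam_flag(), fsparam_string() and struct fs_parameter_description come from the fs_parser code this series adds):

	enum example_param { Opt_autocell, Opt_dyn, Opt_source };

	static const struct fs_parameter_spec example_param_specs[] = {
		fsparam_flag  ("autocell", Opt_autocell),	/* boolean switch */
		fsparam_flag  ("dyn",      Opt_dyn),
		fsparam_string("source",   Opt_source),		/* takes a string value */
		{}						/* terminator */
	};

	static const struct fs_parameter_description example_fs_parameters = {
		.name  = "example",
		.specs = example_param_specs,
	};
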
+ if BLOCK config FS_IOMAP @@ -254,12 +261,9 @@ source "fs/romfs/Kconfig" source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" -source "fs/exofs/Kconfig" endif # MISC_FILESYSTEMS -source "fs/exofs/Kconfig.ore" - menuconfig NETWORK_FILESYSTEMS bool "Network File Systems" default y diff --git a/fs/Makefile b/fs/Makefile index ffeaa6632ab4..427fec226fae 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -13,7 +13,7 @@ obj-y := open.o read_write.o file_table.o super.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ - fs_types.o + fs_types.o fs_context.o fs_parser.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o @@ -126,7 +126,6 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ obj-$(CONFIG_F2FS_FS) += f2fs/ -obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 1c7955f5cdaf..128f2dbe256a 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -203,8 +203,7 @@ void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi) */ void afs_init_callback_state(struct afs_server *server) { - if (!test_and_clear_bit(AFS_SERVER_FL_NEW, &server->flags)) - server->cb_s_break++; + server->cb_s_break++; } /* diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 8ee5972893ed..2f8acb4c556d 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -34,7 +34,7 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *); static int afs_deliver_yfs_cb_callback(struct afs_call *); #define CM_NAME(name) \ - const char afs_SRXCB##name##_name[] __tracepoint_string = \ + char afs_SRXCB##name##_name[] __tracepoint_string = \ "CB." 
#name /* diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index ca08c83168f5..0b37867b5c20 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -1515,8 +1515,8 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr) xdr_encode_AFS_StoreStatus(&bp, attr); - *bp++ = 0; /* position of start of write */ - *bp++ = 0; + *bp++ = htonl(attr->ia_size >> 32); /* position of start of write */ + *bp++ = htonl((u32) attr->ia_size); *bp++ = 0; /* size of write */ *bp++ = 0; *bp++ = htonl(attr->ia_size >> 32); /* new file length */ @@ -1564,7 +1564,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr) xdr_encode_AFS_StoreStatus(&bp, attr); - *bp++ = 0; /* position of start of write */ + *bp++ = htonl(attr->ia_size); /* position of start of write */ *bp++ = 0; /* size of write */ *bp++ = htonl(attr->ia_size); /* new file length */ diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 1a4ce07fb406..9cedc3fc1b77 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -216,9 +216,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) set_nlink(inode, 2); inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; - inode->i_ctime.tv_sec = get_seconds(); - inode->i_ctime.tv_nsec = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime; + inode->i_ctime = inode->i_atime = inode->i_mtime = current_time(inode); inode->i_blocks = 0; inode_set_iversion_raw(inode, 0); inode->i_generation = 0; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 8871b9e8645f..3904ab0b9563 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -36,15 +36,14 @@ struct pagevec; struct afs_call; -struct afs_mount_params { - bool rwpath; /* T if the parent should be considered R/W */ +struct afs_fs_context { bool force; /* T to force cell type */ bool autocell; /* T if set auto mount operation */ bool dyn_root; /* T if dynamic root */ + bool no_cell; /* T if the source is "none" (for dynroot) */ afs_voltype_t type; /* type of volume requested */ - int volnamesz; /* size of volume name */ + unsigned int volnamesz; /* size of volume name */ const char *volname; /* name of volume to mount */ - struct net *net_ns; /* Network namespace in effect */ struct afs_net *net; /* the AFS net namespace stuff */ struct afs_cell *cell; /* cell in which to find volume */ struct afs_volume *volume; /* volume record */ @@ -475,7 +474,6 @@ struct afs_server { time64_t put_time; /* Time at which last put */ time64_t update_at; /* Time at which to next update the record */ unsigned long flags; -#define AFS_SERVER_FL_NEW 0 /* New server, don't inc cb_s_break */ #define AFS_SERVER_FL_NOT_READY 1 /* The record is not ready for use */ #define AFS_SERVER_FL_NOT_FOUND 2 /* VL server says no such server */ #define AFS_SERVER_FL_VL_FAIL 3 /* Failed to access VL server */ @@ -828,7 +826,7 @@ static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) { - return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break; + return vnode->cb_break + vnode->cb_v_break; } static inline bool afs_cb_is_broken(unsigned int cb_break, @@ -836,7 +834,6 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, const struct afs_cb_interest *cbi) { return !cbi || cb_break != (vnode->cb_break + - cbi->server->cb_s_break + vnode->volume->cb_v_break); } @@ -1274,7 +1271,7 @@ static inline struct afs_volume *__afs_get_volume(struct afs_volume *volume) return volume; } -extern struct afs_volume 
*afs_create_volume(struct afs_mount_params *); +extern struct afs_volume *afs_create_volume(struct afs_fs_context *); extern void afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); extern void afs_put_volume(struct afs_cell *, struct afs_volume *); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 2e51c6994148..eecd8b699186 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -17,6 +17,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/gfp.h> +#include <linux/fs_context.h> #include "internal.h" @@ -47,6 +48,8 @@ static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out); static unsigned long afs_mntpt_expiry_timeout = 10 * 60; +static const char afs_root_volume[] = "root.cell"; + /* * no valid lookup procedure on this sort of dir */ @@ -68,108 +71,112 @@ static int afs_mntpt_open(struct inode *inode, struct file *file) } /* - * create a vfsmount to be automounted + * Set the parameters for the proposed superblock. */ -static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) +static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) { - struct afs_super_info *as; - struct vfsmount *mnt; - struct afs_vnode *vnode; - struct page *page; - char *devname, *options; - bool rwpath = false; + struct afs_fs_context *ctx = fc->fs_private; + struct afs_super_info *src_as = AFS_FS_S(mntpt->d_sb); + struct afs_vnode *vnode = AFS_FS_I(d_inode(mntpt)); + struct afs_cell *cell; + const char *p; int ret; - _enter("{%pd}", mntpt); - - BUG_ON(!d_inode(mntpt)); - - ret = -ENOMEM; - devname = (char *) get_zeroed_page(GFP_KERNEL); - if (!devname) - goto error_no_devname; - - options = (char *) get_zeroed_page(GFP_KERNEL); - if (!options) - goto error_no_options; + if (fc->net_ns != src_as->net_ns) { + put_net(fc->net_ns); + fc->net_ns = get_net(src_as->net_ns); + } - vnode = AFS_FS_I(d_inode(mntpt)); + if (src_as->volume && src_as->volume->type == AFSVL_RWVOL) { + ctx->type = AFSVL_RWVOL; + ctx->force = true; + } + if (ctx->cell) { + afs_put_cell(ctx->net, ctx->cell); + ctx->cell = NULL; + } if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { /* if the directory is a pseudo directory, use the d_name */ - static const char afs_root_cell[] = ":root.cell."; unsigned size = mntpt->d_name.len; - ret = -ENOENT; - if (size < 2 || size > AFS_MAXCELLNAME) - goto error_no_page; + if (size < 2) + return -ENOENT; + p = mntpt->d_name.name; if (mntpt->d_name.name[0] == '.') { - devname[0] = '%'; - memcpy(devname + 1, mntpt->d_name.name + 1, size - 1); - memcpy(devname + size, afs_root_cell, - sizeof(afs_root_cell)); - rwpath = true; - } else { - devname[0] = '#'; - memcpy(devname + 1, mntpt->d_name.name, size); - memcpy(devname + size + 1, afs_root_cell, - sizeof(afs_root_cell)); + size--; + p++; + ctx->type = AFSVL_RWVOL; + ctx->force = true; } + if (size > AFS_MAXCELLNAME) + return -ENAMETOOLONG; + + cell = afs_lookup_cell(ctx->net, p, size, NULL, false); + if (IS_ERR(cell)) { + pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt); + return PTR_ERR(cell); + } + ctx->cell = cell; + + ctx->volname = afs_root_volume; + ctx->volnamesz = sizeof(afs_root_volume) - 1; } else { /* read the contents of the AFS special symlink */ + struct page *page; loff_t size = i_size_read(d_inode(mntpt)); char *buf; - ret = -EINVAL; + if (src_as->cell) + ctx->cell = afs_get_cell(src_as->cell); + if (size > PAGE_SIZE - 1) - goto error_no_page; + return -EINVAL; page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); - if 
(IS_ERR(page)) { - ret = PTR_ERR(page); - goto error_no_page; - } + if (IS_ERR(page)) + return PTR_ERR(page); if (PageError(page)) { ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt); - goto error; + put_page(page); + return ret; } - buf = kmap_atomic(page); - memcpy(devname, buf, size); - kunmap_atomic(buf); + buf = kmap(page); + ret = vfs_parse_fs_string(fc, "source", buf, size); + kunmap(page); put_page(page); - page = NULL; + if (ret < 0) + return ret; } - /* work out what options we want */ - as = AFS_FS_S(mntpt->d_sb); - if (as->cell) { - memcpy(options, "cell=", 5); - strcpy(options + 5, as->cell->name); - if ((as->volume && as->volume->type == AFSVL_RWVOL) || rwpath) - strcat(options, ",rwpath"); - } + return 0; +} - /* try and do the mount */ - _debug("--- attempting mount %s -o %s ---", devname, options); - mnt = vfs_submount(mntpt, &afs_fs_type, devname, options); - _debug("--- mount result %p ---", mnt); +/* + * create a vfsmount to be automounted + */ +static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) +{ + struct fs_context *fc; + struct vfsmount *mnt; + int ret; - free_page((unsigned long) devname); - free_page((unsigned long) options); - _leave(" = %p", mnt); - return mnt; + BUG_ON(!d_inode(mntpt)); -error: - put_page(page); -error_no_page: - free_page((unsigned long) options); -error_no_options: - free_page((unsigned long) devname); -error_no_devname: - _leave(" = %d", ret); - return ERR_PTR(ret); + fc = fs_context_for_submount(&afs_fs_type, mntpt); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + ret = afs_mntpt_set_params(fc, mntpt); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + + put_fs_context(fc); + return mnt; } /* diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 2c588f9bbbda..15c7e82d80cb 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -572,13 +572,17 @@ static void afs_deliver_to_call(struct afs_call *call) case -ENODATA: case -EBADMSG: case -EMSGSIZE: - default: abort_code = RXGEN_CC_UNMARSHAL; if (state != AFS_CALL_CL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, "KUM"); goto local_abort; + default: + abort_code = RX_USER_ABORT; + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + abort_code, ret, "KER"); + goto local_abort; } } @@ -610,6 +614,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, bool stalled = false; u64 rtt; u32 life, last_life; + bool rxrpc_complete = false; DECLARE_WAITQUEUE(myself, current); @@ -621,7 +626,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, rtt2 = 2; timeout = rtt2; - last_life = rxrpc_kernel_check_life(call->net->socket, call->rxcall); + rxrpc_kernel_check_life(call->net->socket, call->rxcall, &last_life); add_wait_queue(&call->waitq, &myself); for (;;) { @@ -639,7 +644,12 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, if (afs_check_call_state(call, AFS_CALL_COMPLETE)) break; - life = rxrpc_kernel_check_life(call->net->socket, call->rxcall); + if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall, &life)) { + /* rxrpc terminated the call. */ + rxrpc_complete = true; + break; + } + if (timeout == 0 && life == last_life && signal_pending(current)) { if (stalled) @@ -663,12 +673,16 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); - /* Kill off the call if it's still live. 
*/ if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { - _debug("call interrupted"); - if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall, - RX_USER_ABORT, -EINTR, "KWI")) - afs_set_call_complete(call, -EINTR, 0); + if (rxrpc_complete) { + afs_set_call_complete(call, call->error, call->abort_code); + } else { + /* Kill off the call if it's still live. */ + _debug("call interrupted"); + if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + RX_USER_ABORT, -EINTR, "KWI")) + afs_set_call_complete(call, -EINTR, 0); + } } spin_lock_bh(&call->state_lock); diff --git a/fs/afs/server.c b/fs/afs/server.c index 642afa2e9783..65b33b6da48b 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -226,7 +226,6 @@ static struct afs_server *afs_alloc_server(struct afs_net *net, RCU_INIT_POINTER(server->addresses, alist); server->addr_version = alist->version; server->uuid = *uuid; - server->flags = (1UL << AFS_SERVER_FL_NEW); server->update_at = ktime_get_real_seconds() + afs_server_update_delay; rwlock_init(&server->fs_lock); INIT_HLIST_HEAD(&server->cb_volumes); diff --git a/fs/afs/super.c b/fs/afs/super.c index dcd07fe99871..5adf012b8e27 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -1,6 +1,6 @@ /* AFS superblock handling * - * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved. + * Copyright (c) 2002, 2007, 2018 Red Hat, Inc. All rights reserved. * * This software may be freely redistributed under the terms of the * GNU General Public License. @@ -21,7 +21,7 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/pagemap.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/statfs.h> #include <linux/sched.h> #include <linux/nsproxy.h> @@ -30,21 +30,22 @@ #include "internal.h" static void afs_i_init_once(void *foo); -static struct dentry *afs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data); static void afs_kill_super(struct super_block *sb); static struct inode *afs_alloc_inode(struct super_block *sb); static void afs_destroy_inode(struct inode *inode); static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); static int afs_show_devname(struct seq_file *m, struct dentry *root); static int afs_show_options(struct seq_file *m, struct dentry *root); +static int afs_init_fs_context(struct fs_context *fc); +static const struct fs_parameter_description afs_fs_parameters; struct file_system_type afs_fs_type = { - .owner = THIS_MODULE, - .name = "afs", - .mount = afs_mount, - .kill_sb = afs_kill_super, - .fs_flags = 0, + .owner = THIS_MODULE, + .name = "afs", + .init_fs_context = afs_init_fs_context, + .parameters = &afs_fs_parameters, + .kill_sb = afs_kill_super, + .fs_flags = 0, }; MODULE_ALIAS_FS("afs"); @@ -63,22 +64,22 @@ static const struct super_operations afs_super_ops = { static struct kmem_cache *afs_inode_cachep; static atomic_t afs_count_active_inodes; -enum { - afs_no_opt, - afs_opt_cell, - afs_opt_dyn, - afs_opt_rwpath, - afs_opt_vol, - afs_opt_autocell, +enum afs_param { + Opt_autocell, + Opt_dyn, + Opt_source, }; -static const match_table_t afs_options_list = { - { afs_opt_cell, "cell=%s" }, - { afs_opt_dyn, "dyn" }, - { afs_opt_rwpath, "rwpath" }, - { afs_opt_vol, "vol=%s" }, - { afs_opt_autocell, "autocell" }, - { afs_no_opt, NULL }, +static const struct fs_parameter_spec afs_param_specs[] = { + fsparam_flag ("autocell", Opt_autocell), + fsparam_flag ("dyn", Opt_dyn), + fsparam_string("source", Opt_source), + {} +}; + +static const struct fs_parameter_description afs_fs_parameters = 
{ + .name = "kAFS", + .specs = afs_param_specs, }; /* @@ -190,84 +191,23 @@ static int afs_show_options(struct seq_file *m, struct dentry *root) } /* - * parse the mount options - * - this function has been shamelessly adapted from the ext3 fs which - * shamelessly adapted it from the msdos fs - */ -static int afs_parse_options(struct afs_mount_params *params, - char *options, const char **devname) -{ - struct afs_cell *cell; - substring_t args[MAX_OPT_ARGS]; - char *p; - int token; - - _enter("%s", options); - - options[PAGE_SIZE - 1] = 0; - - while ((p = strsep(&options, ","))) { - if (!*p) - continue; - - token = match_token(p, afs_options_list, args); - switch (token) { - case afs_opt_cell: - rcu_read_lock(); - cell = afs_lookup_cell_rcu(params->net, - args[0].from, - args[0].to - args[0].from); - rcu_read_unlock(); - if (IS_ERR(cell)) - return PTR_ERR(cell); - afs_put_cell(params->net, params->cell); - params->cell = cell; - break; - - case afs_opt_rwpath: - params->rwpath = true; - break; - - case afs_opt_vol: - *devname = args[0].from; - break; - - case afs_opt_autocell: - params->autocell = true; - break; - - case afs_opt_dyn: - params->dyn_root = true; - break; - - default: - printk(KERN_ERR "kAFS:" - " Unknown or invalid mount option: '%s'\n", p); - return -EINVAL; - } - } - - _leave(" = 0"); - return 0; -} - -/* - * parse a device name to get cell name, volume name, volume type and R/W - * selector - * - this can be one of the following: + * Parse the source name to get cell name, volume name, volume type and R/W + * selector. + * + * This can be one of the following: * "%[cell:]volume[.]" R/W volume - * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0), - * or R/W (rwpath=1) volume + * "#[cell:]volume[.]" R/O or R/W volume (R/O parent), + * or R/W (R/W parent) volume * "%[cell:]volume.readonly" R/O volume * "#[cell:]volume.readonly" R/O volume * "%[cell:]volume.backup" Backup volume * "#[cell:]volume.backup" Backup volume */ -static int afs_parse_device_name(struct afs_mount_params *params, - const char *name) +static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) { + struct afs_fs_context *ctx = fc->fs_private; struct afs_cell *cell; - const char *cellname, *suffix; + const char *cellname, *suffix, *name = param->string; int cellnamesz; _enter(",%s", name); @@ -278,69 +218,149 @@ static int afs_parse_device_name(struct afs_mount_params *params, } if ((name[0] != '%' && name[0] != '#') || !name[1]) { + /* To use dynroot, we don't want to have to provide a source */ + if (strcmp(name, "none") == 0) { + ctx->no_cell = true; + return 0; + } printk(KERN_ERR "kAFS: unparsable volume name\n"); return -EINVAL; } /* determine the type of volume we're looking for */ - params->type = AFSVL_ROVOL; - params->force = false; - if (params->rwpath || name[0] == '%') { - params->type = AFSVL_RWVOL; - params->force = true; + if (name[0] == '%') { + ctx->type = AFSVL_RWVOL; + ctx->force = true; } name++; /* split the cell name out if there is one */ - params->volname = strchr(name, ':'); - if (params->volname) { + ctx->volname = strchr(name, ':'); + if (ctx->volname) { cellname = name; - cellnamesz = params->volname - name; - params->volname++; + cellnamesz = ctx->volname - name; + ctx->volname++; } else { - params->volname = name; + ctx->volname = name; cellname = NULL; cellnamesz = 0; } /* the volume type is further affected by a possible suffix */ - suffix = strrchr(params->volname, '.'); + suffix = strrchr(ctx->volname, '.'); if (suffix) { if (strcmp(suffix, 
".readonly") == 0) { - params->type = AFSVL_ROVOL; - params->force = true; + ctx->type = AFSVL_ROVOL; + ctx->force = true; } else if (strcmp(suffix, ".backup") == 0) { - params->type = AFSVL_BACKVOL; - params->force = true; + ctx->type = AFSVL_BACKVOL; + ctx->force = true; } else if (suffix[1] == 0) { } else { suffix = NULL; } } - params->volnamesz = suffix ? - suffix - params->volname : strlen(params->volname); + ctx->volnamesz = suffix ? + suffix - ctx->volname : strlen(ctx->volname); _debug("cell %*.*s [%p]", - cellnamesz, cellnamesz, cellname ?: "", params->cell); + cellnamesz, cellnamesz, cellname ?: "", ctx->cell); /* lookup the cell record */ - if (cellname || !params->cell) { - cell = afs_lookup_cell(params->net, cellname, cellnamesz, + if (cellname) { + cell = afs_lookup_cell(ctx->net, cellname, cellnamesz, NULL, false); if (IS_ERR(cell)) { - printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n", + pr_err("kAFS: unable to lookup cell '%*.*s'\n", cellnamesz, cellnamesz, cellname ?: ""); return PTR_ERR(cell); } - afs_put_cell(params->net, params->cell); - params->cell = cell; + afs_put_cell(ctx->net, ctx->cell); + ctx->cell = cell; } _debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s", - params->cell->name, params->cell, - params->volnamesz, params->volnamesz, params->volname, - suffix ?: "-", params->type, params->force ? " FORCE" : ""); + ctx->cell->name, ctx->cell, + ctx->volnamesz, ctx->volnamesz, ctx->volname, + suffix ?: "-", ctx->type, ctx->force ? " FORCE" : ""); + + fc->source = param->string; + param->string = NULL; + return 0; +} + +/* + * Parse a single mount parameter. + */ +static int afs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct fs_parse_result result; + struct afs_fs_context *ctx = fc->fs_private; + int opt; + + opt = fs_parse(fc, &afs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_source: + return afs_parse_source(fc, param); + + case Opt_autocell: + ctx->autocell = true; + break; + + case Opt_dyn: + ctx->dyn_root = true; + break; + + default: + return -EINVAL; + } + + _leave(" = 0"); + return 0; +} + +/* + * Validate the options, get the cell key and look up the volume. + */ +static int afs_validate_fc(struct fs_context *fc) +{ + struct afs_fs_context *ctx = fc->fs_private; + struct afs_volume *volume; + struct key *key; + + if (!ctx->dyn_root) { + if (ctx->no_cell) { + pr_warn("kAFS: Can only specify source 'none' with -o dyn\n"); + return -EINVAL; + } + + if (!ctx->cell) { + pr_warn("kAFS: No cell specified\n"); + return -EDESTADDRREQ; + } + + /* We try to do the mount securely. 
*/ + key = afs_request_key(ctx->cell); + if (IS_ERR(key)) + return PTR_ERR(key); + + ctx->key = key; + + if (ctx->volume) { + afs_put_volume(ctx->cell, ctx->volume); + ctx->volume = NULL; + } + + volume = afs_create_volume(ctx); + if (IS_ERR(volume)) + return PTR_ERR(volume); + + ctx->volume = volume; + } return 0; } @@ -348,39 +368,34 @@ static int afs_parse_device_name(struct afs_mount_params *params, /* * check a superblock to see if it's the one we're looking for */ -static int afs_test_super(struct super_block *sb, void *data) +static int afs_test_super(struct super_block *sb, struct fs_context *fc) { - struct afs_super_info *as1 = data; + struct afs_fs_context *ctx = fc->fs_private; struct afs_super_info *as = AFS_FS_S(sb); - return (as->net_ns == as1->net_ns && + return (as->net_ns == fc->net_ns && as->volume && - as->volume->vid == as1->volume->vid && + as->volume->vid == ctx->volume->vid && !as->dyn_root); } -static int afs_dynroot_test_super(struct super_block *sb, void *data) +static int afs_dynroot_test_super(struct super_block *sb, struct fs_context *fc) { - struct afs_super_info *as1 = data; struct afs_super_info *as = AFS_FS_S(sb); - return (as->net_ns == as1->net_ns && + return (as->net_ns == fc->net_ns && as->dyn_root); } -static int afs_set_super(struct super_block *sb, void *data) +static int afs_set_super(struct super_block *sb, struct fs_context *fc) { - struct afs_super_info *as = data; - - sb->s_fs_info = as; return set_anon_super(sb, NULL); } /* * fill in the superblock */ -static int afs_fill_super(struct super_block *sb, - struct afs_mount_params *params) +static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) { struct afs_super_info *as = AFS_FS_S(sb); struct afs_fid fid; @@ -399,7 +414,7 @@ static int afs_fill_super(struct super_block *sb, ret = super_setup_bdi(sb); if (ret) return ret; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; /* allocate the root inode and dentry */ if (as->dyn_root) { @@ -412,13 +427,13 @@ static int afs_fill_super(struct super_block *sb, fid.vnode = 1; fid.vnode_hi = 0; fid.unique = 1; - inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL); + inode = afs_iget(sb, ctx->key, &fid, NULL, NULL, NULL); } if (IS_ERR(inode)) return PTR_ERR(inode); - if (params->autocell || params->dyn_root) + if (ctx->autocell || as->dyn_root) set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); ret = -ENOMEM; @@ -443,17 +458,20 @@ error: return ret; } -static struct afs_super_info *afs_alloc_sbi(struct afs_mount_params *params) +static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) { + struct afs_fs_context *ctx = fc->fs_private; struct afs_super_info *as; as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); if (as) { - as->net_ns = get_net(params->net_ns); - if (params->dyn_root) + as->net_ns = get_net(fc->net_ns); + if (ctx->dyn_root) { as->dyn_root = true; - else - as->cell = afs_get_cell(params->cell); + } else { + as->cell = afs_get_cell(ctx->cell); + as->volume = __afs_get_volume(ctx->volume); + } } return as; } @@ -475,7 +493,7 @@ static void afs_kill_super(struct super_block *sb) if (as->dyn_root) afs_dynroot_depopulate(sb); - + /* Clear the callback interests (which will do ilookup5) before * deactivating the superblock. */ @@ -488,111 +506,103 @@ static void afs_kill_super(struct super_block *sb) } /* - * get an AFS superblock + * Get an AFS superblock and root directory. 
*/ -static struct dentry *afs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *options) +static int afs_get_tree(struct fs_context *fc) { - struct afs_mount_params params; + struct afs_fs_context *ctx = fc->fs_private; struct super_block *sb; - struct afs_volume *candidate; - struct key *key; struct afs_super_info *as; int ret; - _enter(",,%s,%p", dev_name, options); - - memset(¶ms, 0, sizeof(params)); - - ret = -EINVAL; - if (current->nsproxy->net_ns != &init_net) + ret = afs_validate_fc(fc); + if (ret) goto error; - params.net_ns = current->nsproxy->net_ns; - params.net = afs_net(params.net_ns); - - /* parse the options and device name */ - if (options) { - ret = afs_parse_options(¶ms, options, &dev_name); - if (ret < 0) - goto error; - } - - if (!params.dyn_root) { - ret = afs_parse_device_name(¶ms, dev_name); - if (ret < 0) - goto error; - /* try and do the mount securely */ - key = afs_request_key(params.cell); - if (IS_ERR(key)) { - _leave(" = %ld [key]", PTR_ERR(key)); - ret = PTR_ERR(key); - goto error; - } - params.key = key; - } + _enter(""); /* allocate a superblock info record */ ret = -ENOMEM; - as = afs_alloc_sbi(¶ms); + as = afs_alloc_sbi(fc); if (!as) - goto error_key; - - if (!params.dyn_root) { - /* Assume we're going to need a volume record; at the very - * least we can use it to update the volume record if we have - * one already. This checks that the volume exists within the - * cell. - */ - candidate = afs_create_volume(¶ms); - if (IS_ERR(candidate)) { - ret = PTR_ERR(candidate); - goto error_as; - } - - as->volume = candidate; - } + goto error; + fc->s_fs_info = as; /* allocate a deviceless superblock */ - sb = sget(fs_type, - as->dyn_root ? afs_dynroot_test_super : afs_test_super, - afs_set_super, flags, as); + sb = sget_fc(fc, + as->dyn_root ? afs_dynroot_test_super : afs_test_super, + afs_set_super); if (IS_ERR(sb)) { ret = PTR_ERR(sb); - goto error_as; + goto error; } if (!sb->s_root) { /* initial superblock/root creation */ _debug("create"); - ret = afs_fill_super(sb, ¶ms); + ret = afs_fill_super(sb, ctx); if (ret < 0) goto error_sb; - as = NULL; sb->s_flags |= SB_ACTIVE; } else { _debug("reuse"); ASSERTCMP(sb->s_flags, &, SB_ACTIVE); - afs_destroy_sbi(as); - as = NULL; } - afs_put_cell(params.net, params.cell); - key_put(params.key); + fc->root = dget(sb->s_root); _leave(" = 0 [%p]", sb); - return dget(sb->s_root); + return 0; error_sb: deactivate_locked_super(sb); - goto error_key; -error_as: - afs_destroy_sbi(as); -error_key: - key_put(params.key); error: - afs_put_cell(params.net, params.cell); _leave(" = %d", ret); - return ERR_PTR(ret); + return ret; +} + +static void afs_free_fc(struct fs_context *fc) +{ + struct afs_fs_context *ctx = fc->fs_private; + + afs_destroy_sbi(fc->s_fs_info); + afs_put_volume(ctx->cell, ctx->volume); + afs_put_cell(ctx->net, ctx->cell); + key_put(ctx->key); + kfree(ctx); +} + +static const struct fs_context_operations afs_context_ops = { + .free = afs_free_fc, + .parse_param = afs_parse_param, + .get_tree = afs_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int afs_init_fs_context(struct fs_context *fc) +{ + struct afs_fs_context *ctx; + struct afs_cell *cell; + + ctx = kzalloc(sizeof(struct afs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->type = AFSVL_ROVOL; + ctx->net = afs_net(fc->net_ns); + + /* Default to the workstation cell. 
*/ + rcu_read_lock(); + cell = afs_lookup_cell_rcu(ctx->net, NULL, 0); + rcu_read_unlock(); + if (IS_ERR(cell)) + cell = NULL; + ctx->cell = cell; + + fc->fs_private = ctx; + fc->ops = &afs_context_ops; + return 0; } /* diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 00975ed3640f..f6eba2def0a1 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -21,7 +21,7 @@ static const char *const afs_voltypes[] = { "R/W", "R/O", "BAK" }; /* * Allocate a volume record and load it up from a vldb record. */ -static struct afs_volume *afs_alloc_volume(struct afs_mount_params *params, +static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, struct afs_vldb_entry *vldb, unsigned long type_mask) { @@ -113,7 +113,7 @@ static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell, * - Rule 3: If parent volume is R/W, then only mount R/W volume unless * explicitly told otherwise */ -struct afs_volume *afs_create_volume(struct afs_mount_params *params) +struct afs_volume *afs_create_volume(struct afs_fs_context *params) { struct afs_vldb_entry *vldb; struct afs_volume *volume; diff --git a/fs/afs/write.c b/fs/afs/write.c index 72efcfcf9f95..0122d7445fba 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -264,6 +264,7 @@ static void afs_kill_pages(struct address_space *mapping, first = page->index + 1; lock_page(page); generic_error_remove_page(mapping, page); + unlock_page(page); } __pagevec_release(&pv); diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 5aa57929e8c2..6e97a42d24d1 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -1514,7 +1514,7 @@ static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr) bp = xdr_encode_u32(bp, 0); /* RPC flags */ bp = xdr_encode_YFSFid(bp, &vnode->fid); bp = xdr_encode_YFS_StoreStatus(bp, attr); - bp = xdr_encode_u64(bp, 0); /* position of start of write */ + bp = xdr_encode_u64(bp, attr->ia_size); /* position of start of write */ bp = xdr_encode_u64(bp, 0); /* size of write */ bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */ yfs_check_req(call, bp); @@ -181,7 +181,7 @@ struct poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool woken; + bool done; bool cancelled; struct wait_queue_entry wait; struct work_struct work; @@ -204,8 +204,7 @@ struct aio_kiocb { struct kioctx *ki_ctx; kiocb_cancel_fn *ki_cancel; - struct iocb __user *ki_user_iocb; /* user's aiocb */ - __u64 ki_user_data; /* user's data for completion */ + struct io_event ki_res; struct list_head ki_list; /* the aio core uses this * for cancellation */ @@ -1022,6 +1021,9 @@ static bool get_reqs_available(struct kioctx *ctx) /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. + * + * The refcount is initialized to 2 - one for the async op completion, + * one for the synchronous code that does this. 
*/ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { @@ -1031,10 +1033,15 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) if (unlikely(!req)) return NULL; + if (unlikely(!get_reqs_available(ctx))) { + kmem_cache_free(kiocb_cachep, req); + return NULL; + } + percpu_ref_get(&ctx->reqs); req->ki_ctx = ctx; INIT_LIST_HEAD(&req->ki_list); - refcount_set(&req->ki_refcnt, 0); + refcount_set(&req->ki_refcnt, 2); req->ki_eventfd = NULL; return req; } @@ -1067,30 +1074,20 @@ out: return ret; } -static inline void iocb_put(struct aio_kiocb *iocb) -{ - if (refcount_read(&iocb->ki_refcnt) == 0 || - refcount_dec_and_test(&iocb->ki_refcnt)) { - if (iocb->ki_filp) - fput(iocb->ki_filp); - percpu_ref_put(&iocb->ki_ctx->reqs); - kmem_cache_free(kiocb_cachep, iocb); - } -} - -static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb, - long res, long res2) +static inline void iocb_destroy(struct aio_kiocb *iocb) { - ev->obj = (u64)(unsigned long)iocb->ki_user_iocb; - ev->data = iocb->ki_user_data; - ev->res = res; - ev->res2 = res2; + if (iocb->ki_eventfd) + eventfd_ctx_put(iocb->ki_eventfd); + if (iocb->ki_filp) + fput(iocb->ki_filp); + percpu_ref_put(&iocb->ki_ctx->reqs); + kmem_cache_free(kiocb_cachep, iocb); } /* aio_complete * Called when the io request on the given iocb is complete. */ -static void aio_complete(struct aio_kiocb *iocb, long res, long res2) +static void aio_complete(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; @@ -1114,14 +1111,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; - aio_fill_event(event, iocb, res, res2); + *event = iocb->ki_res; kunmap_atomic(ev_page); flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); - pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, - res, res2); + pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, + (void __user *)(unsigned long)iocb->ki_res.obj, + iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2); /* after flagging the request as done, we * must never even look at it again @@ -1148,10 +1145,8 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. 
*/ - if (iocb->ki_eventfd) { + if (iocb->ki_eventfd) eventfd_signal(iocb->ki_eventfd, 1); - eventfd_ctx_put(iocb->ki_eventfd); - } /* * We have to order our ring_info tail store above and test @@ -1163,7 +1158,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - iocb_put(iocb); +} + +static inline void iocb_put(struct aio_kiocb *iocb) +{ + if (refcount_dec_and_test(&iocb->ki_refcnt)) { + aio_complete(iocb); + iocb_destroy(iocb); + } } /* aio_read_events_ring @@ -1437,7 +1439,9 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) file_end_write(kiocb->ki_filp); } - aio_complete(iocb, res, res2); + iocb->ki_res.res = res; + iocb->ki_res.res2 = res2; + iocb_put(iocb); } static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) @@ -1514,13 +1518,13 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret) } } -static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb, +static int aio_read(struct kiocb *req, const struct iocb *iocb, bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; struct file *file; - ssize_t ret; + int ret; ret = aio_prep_rw(req, iocb); if (ret) @@ -1542,13 +1546,13 @@ static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb, return ret; } -static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, +static int aio_write(struct kiocb *req, const struct iocb *iocb, bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; struct file *file; - ssize_t ret; + int ret; ret = aio_prep_rw(req, iocb); if (ret) @@ -1585,11 +1589,10 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, static void aio_fsync_work(struct work_struct *work) { - struct fsync_iocb *req = container_of(work, struct fsync_iocb, work); - int ret; + struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); - ret = vfs_fsync(req->file, req->datasync); - aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); + iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); + iocb_put(iocb); } static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, @@ -1608,11 +1611,6 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, return 0; } -static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) -{ - aio_complete(iocb, mangle_poll(mask), 0); -} - static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); @@ -1638,9 +1636,11 @@ static void aio_poll_complete_work(struct work_struct *work) return; } list_del_init(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); + req->done = true; spin_unlock_irq(&ctx->ctx_lock); - aio_poll_complete(iocb, mask); + iocb_put(iocb); } /* assumes we are called with irqs disabled */ @@ -1668,31 +1668,27 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, __poll_t mask = key_to_poll(key); unsigned long flags; - req->woken = true; - /* for instances that support it check for an event match first: */ - if (mask) { - if (!(mask & req->events)) - return 0; + if (mask && !(mask & req->events)) + return 0; + + list_del_init(&req->wait.entry); + if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { /* * Try to complete the iocb inline if we can. Use * irqsave/irqrestore because not all filesystems (e.g. 
fuse) * call this function with IRQs disabled and because IRQs * have to be disabled before ctx_lock is obtained. */ - if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { - list_del(&iocb->ki_list); - spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); - - list_del_init(&req->wait.entry); - aio_poll_complete(iocb, mask); - return 1; - } + list_del(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); + req->done = true; + spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); + iocb_put(iocb); + } else { + schedule_work(&req->work); } - - list_del_init(&req->wait.entry); - schedule_work(&req->work); return 1; } @@ -1719,11 +1715,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, add_wait_queue(head, &pt->iocb->poll.wait); } -static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) +static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) { struct kioctx *ctx = aiocb->ki_ctx; struct poll_iocb *req = &aiocb->poll; struct aio_poll_table apt; + bool cancel = false; __poll_t mask; /* reject any unknown events outside the normal event mask. */ @@ -1737,7 +1734,7 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; req->head = NULL; - req->woken = false; + req->done = false; req->cancelled = false; apt.pt._qproc = aio_poll_queue_proc; @@ -1749,156 +1746,135 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) INIT_LIST_HEAD(&req->wait.entry); init_waitqueue_func_entry(&req->wait, aio_poll_wake); - /* one for removal from waitqueue, one for this function */ - refcount_set(&aiocb->ki_refcnt, 2); - mask = vfs_poll(req->file, &apt.pt) & req->events; - if (unlikely(!req->head)) { - /* we did not manage to set up a waitqueue, done */ - goto out; - } - spin_lock_irq(&ctx->ctx_lock); - spin_lock(&req->head->lock); - if (req->woken) { - /* wake_up context handles the rest */ - mask = 0; + if (likely(req->head)) { + spin_lock(&req->head->lock); + if (unlikely(list_empty(&req->wait.entry))) { + if (apt.error) + cancel = true; + apt.error = 0; + mask = 0; + } + if (mask || apt.error) { + list_del_init(&req->wait.entry); + } else if (cancel) { + WRITE_ONCE(req->cancelled, true); + } else if (!req->done) { /* actually waiting for an event */ + list_add_tail(&aiocb->ki_list, &ctx->active_reqs); + aiocb->ki_cancel = aio_poll_cancel; + } + spin_unlock(&req->head->lock); + } + if (mask) { /* no async, we'd stolen it */ + aiocb->ki_res.res = mangle_poll(mask); apt.error = 0; - } else if (mask || apt.error) { - /* if we get an error or a mask we are done */ - WARN_ON_ONCE(list_empty(&req->wait.entry)); - list_del_init(&req->wait.entry); - } else { - /* actually waiting for an event */ - list_add_tail(&aiocb->ki_list, &ctx->active_reqs); - aiocb->ki_cancel = aio_poll_cancel; } - spin_unlock(&req->head->lock); spin_unlock_irq(&ctx->ctx_lock); - -out: - if (unlikely(apt.error)) - return apt.error; - if (mask) - aio_poll_complete(aiocb, mask); - iocb_put(aiocb); - return 0; + iocb_put(aiocb); + return apt.error; } static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, - struct iocb __user *user_iocb, bool compat) + struct iocb __user *user_iocb, struct aio_kiocb *req, + bool compat) { - struct aio_kiocb *req; - ssize_t ret; - - /* enforce forwards compatibility on users */ - if (unlikely(iocb->aio_reserved2)) { - pr_debug("EINVAL: reserve field set\n"); - return -EINVAL; - } - - /* prevent overflows */ - if (unlikely( - 
(iocb->aio_buf != (unsigned long)iocb->aio_buf) || - (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || - ((ssize_t)iocb->aio_nbytes < 0) - )) { - pr_debug("EINVAL: overflow check\n"); - return -EINVAL; - } - - if (!get_reqs_available(ctx)) - return -EAGAIN; - - ret = -EAGAIN; - req = aio_get_req(ctx); - if (unlikely(!req)) - goto out_put_reqs_available; - req->ki_filp = fget(iocb->aio_fildes); - ret = -EBADF; if (unlikely(!req->ki_filp)) - goto out_put_req; + return -EBADF; if (iocb->aio_flags & IOCB_FLAG_RESFD) { + struct eventfd_ctx *eventfd; /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ - req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); - if (IS_ERR(req->ki_eventfd)) { - ret = PTR_ERR(req->ki_eventfd); - req->ki_eventfd = NULL; - goto out_put_req; - } + eventfd = eventfd_ctx_fdget(iocb->aio_resfd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + req->ki_eventfd = eventfd; } - ret = put_user(KIOCB_KEY, &user_iocb->aio_key); - if (unlikely(ret)) { + if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) { pr_debug("EFAULT: aio_key\n"); - goto out_put_req; + return -EFAULT; } - req->ki_user_iocb = user_iocb; - req->ki_user_data = iocb->aio_data; + req->ki_res.obj = (u64)(unsigned long)user_iocb; + req->ki_res.data = iocb->aio_data; + req->ki_res.res = 0; + req->ki_res.res2 = 0; switch (iocb->aio_lio_opcode) { case IOCB_CMD_PREAD: - ret = aio_read(&req->rw, iocb, false, compat); - break; + return aio_read(&req->rw, iocb, false, compat); case IOCB_CMD_PWRITE: - ret = aio_write(&req->rw, iocb, false, compat); - break; + return aio_write(&req->rw, iocb, false, compat); case IOCB_CMD_PREADV: - ret = aio_read(&req->rw, iocb, true, compat); - break; + return aio_read(&req->rw, iocb, true, compat); case IOCB_CMD_PWRITEV: - ret = aio_write(&req->rw, iocb, true, compat); - break; + return aio_write(&req->rw, iocb, true, compat); case IOCB_CMD_FSYNC: - ret = aio_fsync(&req->fsync, iocb, false); - break; + return aio_fsync(&req->fsync, iocb, false); case IOCB_CMD_FDSYNC: - ret = aio_fsync(&req->fsync, iocb, true); - break; + return aio_fsync(&req->fsync, iocb, true); case IOCB_CMD_POLL: - ret = aio_poll(req, iocb); - break; + return aio_poll(req, iocb); default: pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); - ret = -EINVAL; - break; + return -EINVAL; } - - /* - * If ret is 0, we'd either done aio_complete() ourselves or have - * arranged for that to be done asynchronously. Anything non-zero - * means that we need to destroy req ourselves. 
- */ - if (ret) - goto out_put_req; - return 0; -out_put_req: - if (req->ki_eventfd) - eventfd_ctx_put(req->ki_eventfd); - iocb_put(req); -out_put_reqs_available: - put_reqs_available(ctx, 1); - return ret; } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, bool compat) { + struct aio_kiocb *req; struct iocb iocb; + int err; if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) return -EFAULT; - return __io_submit_one(ctx, &iocb, user_iocb, compat); + /* enforce forwards compatibility on users */ + if (unlikely(iocb.aio_reserved2)) { + pr_debug("EINVAL: reserve field set\n"); + return -EINVAL; + } + + /* prevent overflows */ + if (unlikely( + (iocb.aio_buf != (unsigned long)iocb.aio_buf) || + (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || + ((ssize_t)iocb.aio_nbytes < 0) + )) { + pr_debug("EINVAL: overflow check\n"); + return -EINVAL; + } + + req = aio_get_req(ctx); + if (unlikely(!req)) + return -EAGAIN; + + err = __io_submit_one(ctx, &iocb, user_iocb, req, compat); + + /* Done with the synchronous reference */ + iocb_put(req); + + /* + * If err is 0, we'd either done aio_complete() ourselves or have + * arranged for that to be done asynchronously. Anything non-zero + * means that we need to destroy req ourselves. + */ + if (unlikely(err)) { + iocb_destroy(req); + put_reqs_available(ctx, 1); + } + return err; } /* sys_io_submit: @@ -1997,24 +1973,6 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, } #endif -/* lookup_kiocb - * Finds a given iocb for cancellation. - */ -static struct aio_kiocb * -lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb) -{ - struct aio_kiocb *kiocb; - - assert_spin_locked(&ctx->ctx_lock); - - /* TODO: use a hash or array, this sucks. */ - list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { - if (kiocb->ki_user_iocb == iocb) - return kiocb; - } - return NULL; -} - /* sys_io_cancel: * Attempts to cancel an iocb previously passed to io_submit. If * the operation is successfully cancelled, the resulting event is @@ -2032,6 +1990,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct aio_kiocb *kiocb; int ret = -EINVAL; u32 key; + u64 obj = (u64)(unsigned long)iocb; if (unlikely(get_user(key, &iocb->aio_key))) return -EFAULT; @@ -2043,10 +2002,13 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - kiocb = lookup_kiocb(ctx, iocb); - if (kiocb) { - ret = kiocb->ki_cancel(&kiocb->rw); - list_del_init(&kiocb->ki_list); + /* TODO: use a hash or array, this sucks. 
*/ + list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { + if (kiocb->ki_res.obj == obj) { + ret = kiocb->ki_cancel(&kiocb->rw); + list_del_init(&kiocb->ki_list); + break; + } } spin_unlock_irq(&ctx->ctx_lock); diff --git a/fs/block_dev.c b/fs/block_dev.c index e9faa52bb489..bb28e2ead679 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -264,7 +264,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio_for_each_segment_all(bvec, &bio, i, iter_all) { if (should_dirty && !PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); - put_page(bvec->bv_page); + if (!bio_flagged(&bio, BIO_NO_PAGE_REF)) + put_page(bvec->bv_page); } if (unlikely(bio.bi_status)) @@ -307,10 +308,10 @@ static void blkdev_bio_end_io(struct bio *bio) struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->should_dirty; - if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { - if (bio->bi_status && !dio->bio.bi_status) - dio->bio.bi_status = bio->bi_status; - } else { + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; + + if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; @@ -336,12 +337,14 @@ static void blkdev_bio_end_io(struct bio *bio) if (should_dirty) { bio_check_pages_dirty(bio); } else { - struct bio_vec *bvec; - int i; - struct bvec_iter_all iter_all; + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + int i; - bio_for_each_segment_all(bvec, bio, i, iter_all) - put_page(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, i, iter_all) + put_page(bvec->bv_page); + } bio_put(bio); } } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 129d26226e70..b3642367a595 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1210,6 +1210,8 @@ enum { * Set for the subvolume tree owning the reloc tree. */ BTRFS_ROOT_DEAD_RELOC_TREE, + /* Mark dead root stored on device whose cleanup needs to be resumed */ + BTRFS_ROOT_DEAD_TREE, }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f0cdb53f3e2d..6fe9197f6ee4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2958,7 +2958,7 @@ int open_ctree(struct super_block *sb, sb->s_bdi->congested_fn = btrfs_congested_fn; sb->s_bdi->congested_data = fs_info; sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 994f0cc41799..c5880329ae37 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6174,7 +6174,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, * * This is overestimating in most cases. 
*/ - qgroup_rsv_size = outstanding_extents * fs_info->nodesize; + qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; spin_lock(&block_rsv->lock); block_rsv->size = reserve_size; @@ -8764,6 +8764,8 @@ struct walk_control { u64 refs[BTRFS_MAX_LEVEL]; u64 flags[BTRFS_MAX_LEVEL]; struct btrfs_key update_progress; + struct btrfs_key drop_progress; + int drop_level; int stage; int level; int shared_level; @@ -8771,6 +8773,7 @@ struct walk_control { int keep_locks; int reada_slot; int reada_count; + int restarted; }; #define DROP_REFERENCE 1 @@ -8934,6 +8937,33 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, } /* + * This is used to verify a ref exists for this root to deal with a bug where we + * would have a drop_progress key that hadn't been updated properly. + */ +static int check_ref_exists(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 parent, + int level) +{ + struct btrfs_path *path; + struct btrfs_extent_inline_ref *iref; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = lookup_extent_backref(trans, path, &iref, bytenr, + root->fs_info->nodesize, parent, + root->root_key.objectid, level, 0); + btrfs_free_path(path); + if (ret == -ENOENT) + return 0; + if (ret < 0) + return ret; + return 1; +} + +/* * helper to process tree block pointer. * * when wc->stage == DROP_REFERENCE, this function checks @@ -9088,6 +9118,23 @@ skip: } /* + * If we had a drop_progress we need to verify the refs are set + * as expected. If we find our ref then we know that from here + * on out everything should be correct, and we can clear the + * ->restarted flag. + */ + if (wc->restarted) { + ret = check_ref_exists(trans, root, bytenr, parent, + level - 1); + if (ret < 0) + goto out_unlock; + if (ret == 0) + goto no_delete; + ret = 0; + wc->restarted = 0; + } + + /* * Reloc tree doesn't contribute to qgroup numbers, and we have * already accounted them at merge time (replace_path), * thus we could skip expensive subtree trace here. @@ -9102,13 +9149,23 @@ skip: ret); } } + + /* + * We need to update the next key in our walk control so we can + * update the drop_progress key accordingly. We don't care if + * find_next_key doesn't find a key because that means we're at + * the end and are going to clean up now. 
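+		 * The key and level saved here are what get packed into the
+		 * root item's drop_progress and drop_level at transaction end,
+		 * and they are the point a restarted deletion (wc->restarted)
+		 * resumes from and revalidates via check_ref_exists().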
+ */ + wc->drop_level = level; + find_next_key(path, level, &wc->drop_progress); + ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize, parent, root->root_key.objectid, level - 1, 0); if (ret) goto out_unlock; } - +no_delete: *lookup_info = 1; ret = 1; @@ -9425,6 +9482,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } } + wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state); wc->level = level; wc->shared_level = -1; wc->stage = DROP_REFERENCE; @@ -9452,12 +9510,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } if (wc->stage == DROP_REFERENCE) { - level = wc->level; - btrfs_node_key(path->nodes[level], - &root_item->drop_progress, - path->slots[level]); - root_item->drop_level = level; - } + wc->drop_level = wc->level; + btrfs_node_key_to_cpu(path->nodes[wc->drop_level], + &wc->drop_progress, + path->slots[wc->drop_level]); + } + btrfs_cpu_key_to_disk(&root_item->drop_progress, + &wc->drop_progress); + root_item->drop_level = wc->drop_level; BUG_ON(wc->level == 0); if (btrfs_should_end_transaction(trans) || diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ab705183d749..ca8b8e785cf3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2995,11 +2995,11 @@ static int __do_readpage(struct extent_io_tree *tree, */ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && prev_em_start && *prev_em_start != (u64)-1 && - *prev_em_start != em->orig_start) + *prev_em_start != em->start) force_bio_submit = true; if (prev_em_start) - *prev_em_start = em->orig_start; + *prev_em_start = em->start; free_extent_map(em); em = NULL; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 920bf3b4b0ef..cccc75d15970 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/highmem.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -427,9 +428,13 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, unsigned long this_sum_bytes = 0; int i; u64 offset; + unsigned nofs_flag; + + nofs_flag = memalloc_nofs_save(); + sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), + GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); - sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), - GFP_NOFS); if (!sums) return BLK_STS_RESOURCE; @@ -472,8 +477,10 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, bytes_left = bio->bi_iter.bi_size - total_bytes; - sums = kzalloc(btrfs_ordered_sum_size(fs_info, bytes_left), - GFP_NOFS); + nofs_flag = memalloc_nofs_save(); + sums = kvzalloc(btrfs_ordered_sum_size(fs_info, + bytes_left), GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); BUG_ON(!sums); /* -ENOMEM */ sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 82fdda8ff5ab..2973608824ec 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6783,7 +6783,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); - u8 extent_type; + int extent_type = -1; struct btrfs_path *path = NULL; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *item; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 494f0f10d70e..cd4e693406a0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -501,6 +501,16 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) 
return -EPERM; + /* + * If the fs is mounted with nologreplay, which requires it to be + * mounted in RO mode as well, we can not allow discard on free space + * inside block groups, because log trees refer to extents that are not + * pinned in a block group's free space cache (pinning the extents is + * precisely the first phase of replaying a log tree). + */ + if (btrfs_test_opt(fs_info, NOLOGREPLAY)) + return -EROFS; + rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { @@ -3207,21 +3217,6 @@ out: return ret; } -static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) -{ - inode_unlock(inode1); - inode_unlock(inode2); -} - -static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) -{ - if (inode1 < inode2) - swap(inode1, inode2); - - inode_lock_nested(inode1, I_MUTEX_PARENT); - inode_lock_nested(inode2, I_MUTEX_CHILD); -} - static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, struct inode *inode2, u64 loff2, u64 len) { @@ -3956,7 +3951,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (same_inode) inode_lock(inode_in); else - btrfs_double_inode_lock(inode_in, inode_out); + lock_two_nondirectories(inode_in, inode_out); /* don't make the dst file partly checksummed */ if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != @@ -4013,7 +4008,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (same_inode) inode_unlock(inode_in); else - btrfs_double_inode_unlock(inode_in, inode_out); + unlock_two_nondirectories(inode_in, inode_out); return ret; } @@ -4043,7 +4038,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, if (same_inode) inode_unlock(src_inode); else - btrfs_double_inode_unlock(src_inode, dst_inode); + unlock_two_nondirectories(src_inode, dst_inode); return ret < 0 ? 
ret : len; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6fde2b2741ef..45e3cfd1198b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -6,6 +6,7 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/writeback.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "transaction.h" #include "btrfs_inode.h" @@ -442,7 +443,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); list_del(&sum->list); - kfree(sum); + kvfree(sum); } kmem_cache_free(btrfs_ordered_extent_cache, entry); } diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index dc6140013ae8..61d22a56c0ba 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -366,11 +366,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, static int prop_compression_validate(const char *value, size_t len) { - if (!strncmp("lzo", value, len)) + if (!strncmp("lzo", value, 3)) return 0; - else if (!strncmp("zlib", value, len)) + else if (!strncmp("zlib", value, 4)) return 0; - else if (!strncmp("zstd", value, len)) + else if (!strncmp("zstd", value, 4)) return 0; return -EINVAL; @@ -396,7 +396,7 @@ static int prop_compression_apply(struct inode *inode, btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); } else if (!strncmp("zlib", value, 4)) { type = BTRFS_COMPRESS_ZLIB; - } else if (!strncmp("zstd", value, len)) { + } else if (!strncmp("zstd", value, 4)) { type = BTRFS_COMPRESS_ZSTD; btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } else { diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c1cd5558a646..e659d9d61107 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -894,6 +894,12 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) if (fs_info->quota_root) goto out; + fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); + if (!fs_info->qgroup_ulist) { + ret = -ENOMEM; + goto out; + } + /* * 1 for quota root item * 1 for BTRFS_QGROUP_STATUS item @@ -909,13 +915,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) goto out; } - fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); - if (!fs_info->qgroup_ulist) { - ret = -ENOMEM; - btrfs_abort_transaction(trans, ret); - goto out; - } - /* * initially create the quota tree */ @@ -1923,8 +1922,8 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, int i; /* Level sanity check */ - if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL || - root_level < 0 || root_level >= BTRFS_MAX_LEVEL || + if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || + root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || root_level < cur_level) { btrfs_err_rl(fs_info, "%s: bad levels, cur_level=%d root_level=%d", diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 1869ba8e5981..67a6f7d47402 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2430,8 +2430,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, bitmap_clear(rbio->dbitmap, pagenr, 1); kunmap(p); - for (stripe = 0; stripe < rbio->real_stripes; stripe++) + for (stripe = 0; stripe < nr_data; stripe++) kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + kunmap(p_page); } __free_page(p_page); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index d09b6cdb785a..b283d3a6e837 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -205,28 +205,17 @@ static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid) #ifdef CONFIG_STACKTRACE static void __save_stack_trace(struct ref_action *ra) { - struct 
stack_trace stack_trace;
-
-	stack_trace.max_entries = MAX_TRACE;
-	stack_trace.nr_entries = 0;
-	stack_trace.entries = ra->trace;
-	stack_trace.skip = 2;
-	save_stack_trace(&stack_trace);
-	ra->trace_len = stack_trace.nr_entries;
+	ra->trace_len = stack_trace_save(ra->trace, MAX_TRACE, 2);
 }
 
 static void __print_stack_trace(struct btrfs_fs_info *fs_info,
 				struct ref_action *ra)
 {
-	struct stack_trace trace;
-
 	if (ra->trace_len == 0) {
 		btrfs_err(fs_info, "  ref-verify: no stacktrace");
 		return;
 	}
-	trace.nr_entries = ra->trace_len;
-	trace.entries = ra->trace;
-	print_stack_trace(&trace, 2);
+	stack_trace_print(ra->trace, ra->trace_len, 2);
 }
 #else
 static void inline __save_stack_trace(struct ref_action *ra)
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0d2b957ca3a3..893d12fbfda0 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -263,8 +263,10 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
 		if (root) {
 			WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
 					  &root->state));
-			if (btrfs_root_refs(&root->root_item) == 0)
+			if (btrfs_root_refs(&root->root_item) == 0) {
+				set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
 				btrfs_add_dead_root(root);
+			}
 			continue;
 		}
 
@@ -310,8 +312,10 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
 			break;
 		}
 
-		if (btrfs_root_refs(&root->root_item) == 0)
+		if (btrfs_root_refs(&root->root_item) == 0) {
+			set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
 			btrfs_add_dead_root(root);
+		}
 	}
 
 	btrfs_free_path(path);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index acdad6d658f5..e4e665f422fc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1886,8 +1886,10 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
 	}
 }
 
-static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+
 	/*
 	 * We use writeback_inodes_sb here because if we used
 	 * btrfs_start_delalloc_roots we would deadlock with fs freeze.
@@ -1897,15 +1899,50 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
 	 * from already being in a transaction and our join_transaction doesn't
 	 * have to re-take the fs freeze lock.
 	 */
-	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
 		writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
+	} else {
+		struct btrfs_pending_snapshot *pending;
+		struct list_head *head = &trans->transaction->pending_snapshots;
+
+		/*
+		 * Flush delalloc for any root that is going to be snapshotted.
+		 * This is done to avoid a corrupted version of files, in the
+		 * snapshots, that had both buffered and direct IO writes (even
+		 * if they were done sequentially) due to an unordered update of
+		 * the inode's size on disk.
+		 */
+		list_for_each_entry(pending, head, list) {
+			int ret;
+
+			ret = btrfs_start_delalloc_snapshot(pending->root);
+			if (ret)
+				return ret;
+		}
+	}
 	return 0;
 }
 
-static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+static inline void btrfs_wait_delalloc_flush(struct btrfs_trans_handle *trans)
 {
-	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+
+	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
 		btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+	} else {
+		struct btrfs_pending_snapshot *pending;
+		struct list_head *head = &trans->transaction->pending_snapshots;
+
+		/*
+		 * Wait for any delalloc that we started previously for the roots
+		 * that are going to be snapshotted. This is to avoid a corrupted
+		 * version of files in the snapshots that had both buffered and
+		 * direct IO writes (even if they were done sequentially).
+		 */
+		list_for_each_entry(pending, head, list)
+			btrfs_wait_ordered_extents(pending->root,
+						   U64_MAX, 0, U64_MAX);
+	}
 }
 
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
@@ -2023,7 +2060,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
 	extwriter_counter_dec(cur_trans, trans->type);
 
-	ret = btrfs_start_delalloc_flush(fs_info);
+	ret = btrfs_start_delalloc_flush(trans);
 	if (ret)
 		goto cleanup_transaction;
 
@@ -2039,7 +2076,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	if (ret)
 		goto cleanup_transaction;
 
-	btrfs_wait_delalloc_flush(fs_info);
+	btrfs_wait_delalloc_flush(trans);
 
 	btrfs_scrub_pause(fs_info);
 
 	/*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f06454a55e00..561884f60d35 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3578,9 +3578,16 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	}
 	btrfs_release_path(path);
 
-	/* find the first key from this transaction again */
+	/*
+	 * Find the first key from this transaction again. See the note for
+	 * log_new_dir_dentries, if we're logging a directory recursively we
+	 * won't be holding its i_mutex, which means we can modify the directory
+	 * while we're logging it. If we remove an entry between our first
+	 * search and this search we'll not find the key again and can just
+	 * bail.
+	 */
 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
-	if (WARN_ON(ret != 0))
+	if (ret != 0)
 		goto done;
 
 	/*
@@ -4544,6 +4551,19 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
 		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				      struct btrfs_inode_item);
 		*size_ret = btrfs_inode_size(path->nodes[0], item);
+		/*
+		 * If the in-memory inode's i_size is smaller than the inode
+		 * size stored in the btree, return the inode's i_size, so
+		 * that we get a correct inode size after replaying the log
+		 * when before a power failure we had a shrinking truncate
+		 * followed by addition of a new name (rename / new hard link).
+		 * Otherwise return the inode size from the btree, to avoid
+		 * data loss when replaying a log due to previously doing a
+		 * write that expands the inode's size and logging a new name
+		 * immediately after.
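+		 * In effect, the size that gets logged is the minimum of the
+		 * size stored in the btree and the in-memory i_size.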
+ */ + if (*size_ret > inode->vfs_inode.i_size) + *size_ret = inode->vfs_inode.i_size; } btrfs_release_path(path); @@ -4705,15 +4725,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(leaf, extent); - ASSERT(len == i_size || - (len == fs_info->sectorsize && - btrfs_file_extent_compression(leaf, extent) != - BTRFS_COMPRESS_NONE) || - (len < i_size && i_size < fs_info->sectorsize)); + BTRFS_FILE_EXTENT_INLINE) return 0; - } len = btrfs_file_extent_num_bytes(leaf, extent); /* Last extent goes beyond i_size, no need to log a hole. */ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9024eee889b9..db934ceae9c1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6407,7 +6407,7 @@ static void btrfs_end_bio(struct bio *bio) if (bio_op(bio) == REQ_OP_WRITE) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - else + else if (!(bio->bi_opf & REQ_RAHEAD)) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); if (bio->bi_opf & REQ_PREFLUSH) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 3e418a3aeb11..6b9e29d050f3 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -195,8 +195,7 @@ static void zstd_cleanup_workspace_manager(void) struct workspace *workspace; int i; - del_timer(&wsm.timer); - + spin_lock(&wsm.lock); for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { while (!list_empty(&wsm.idle_ws[i])) { workspace = container_of(wsm.idle_ws[i].next, @@ -206,6 +205,9 @@ static void zstd_cleanup_workspace_manager(void) wsm.ops->free_workspace(&workspace->list); } } + spin_unlock(&wsm.lock); + + del_timer_sync(&wsm.timer); } /* diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index bba28a5034ba..36a8dc699448 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -148,11 +148,17 @@ void ceph_caps_finalize(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->caps_list_lock); } -void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) +void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, + struct ceph_mount_options *fsopt) { spin_lock(&mdsc->caps_list_lock); - mdsc->caps_min_count += delta; - BUG_ON(mdsc->caps_min_count < 0); + mdsc->caps_min_count = fsopt->max_readdir; + if (mdsc->caps_min_count < 1024) + mdsc->caps_min_count = 1024; + mdsc->caps_use_max = fsopt->caps_max; + if (mdsc->caps_use_max > 0 && + mdsc->caps_use_max < mdsc->caps_min_count) + mdsc->caps_use_max = mdsc->caps_min_count; spin_unlock(&mdsc->caps_list_lock); } @@ -272,6 +278,7 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, if (!err) { BUG_ON(have + alloc != need); ctx->count = need; + ctx->used = 0; } spin_lock(&mdsc->caps_list_lock); @@ -295,13 +302,24 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, } void ceph_unreserve_caps(struct ceph_mds_client *mdsc, - struct ceph_cap_reservation *ctx) + struct ceph_cap_reservation *ctx) { + bool reclaim = false; + if (!ctx->count) + return; + dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); spin_lock(&mdsc->caps_list_lock); __ceph_unreserve_caps(mdsc, ctx->count); ctx->count = 0; + + if (mdsc->caps_use_max > 0 && + mdsc->caps_use_count > mdsc->caps_use_max) + reclaim = true; spin_unlock(&mdsc->caps_list_lock); + + if (reclaim) + ceph_reclaim_caps_nr(mdsc, ctx->used); } struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, @@ -346,6 +364,7 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, BUG_ON(list_empty(&mdsc->caps_list)); ctx->count--; + 
ctx->used++; mdsc->caps_reserve_count--; mdsc->caps_use_count++; @@ -500,12 +519,12 @@ static void __insert_cap_node(struct ceph_inode_info *ci, static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - struct ceph_mount_options *ma = mdsc->fsc->mount_options; + struct ceph_mount_options *opt = mdsc->fsc->mount_options; ci->i_hold_caps_min = round_jiffies(jiffies + - ma->caps_wanted_delay_min * HZ); + opt->caps_wanted_delay_min * HZ); ci->i_hold_caps_max = round_jiffies(jiffies + - ma->caps_wanted_delay_max * HZ); + opt->caps_wanted_delay_max * HZ); dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); } @@ -657,6 +676,10 @@ void ceph_add_cap(struct inode *inode, session->s_nr_caps++; spin_unlock(&session->s_cap_lock); } else { + spin_lock(&session->s_cap_lock); + list_move_tail(&cap->session_caps, &session->s_caps); + spin_unlock(&session->s_cap_lock); + if (cap->cap_gen < session->s_cap_gen) cap->issued = cap->implemented = CEPH_CAP_PIN; @@ -1081,9 +1104,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { cap->queue_release = 1; if (removed) { - list_add_tail(&cap->session_caps, - &session->s_cap_releases); - session->s_num_cap_releases++; + __ceph_queue_cap_release(session, cap); removed = 0; } } else { @@ -1245,7 +1266,7 @@ static int send_cap_msg(struct cap_msg_args *arg) * Queue cap releases when an inode is dropped from our cache. Since * inode is about to be destroyed, there is no need for i_ceph_lock. */ -void ceph_queue_caps_release(struct inode *inode) +void __ceph_remove_caps(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); struct rb_node *p; @@ -2393,6 +2414,12 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, if ((cap->issued & ci->i_flushing_caps) != ci->i_flushing_caps) { ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + /* encode_caps_cb() also will reset these sequence + * numbers. make sure sequence numbers in cap flush + * message match later reconnect message */ + cap->seq = 0; + cap->issue_seq = 0; + cap->mseq = 0; __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } else { @@ -3880,12 +3907,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, cap->seq = seq; cap->issue_seq = seq; spin_lock(&session->s_cap_lock); - list_add_tail(&cap->session_caps, - &session->s_cap_releases); - session->s_num_cap_releases++; + __ceph_queue_cap_release(session, cap); spin_unlock(&session->s_cap_lock); } - goto flush_cap_releases; + goto done; } /* these will work even if we don't have a cap yet */ @@ -3955,7 +3980,12 @@ void ceph_handle_caps(struct ceph_mds_session *session, ceph_cap_op_name(op)); } - goto done; +done: + mutex_unlock(&session->s_mutex); +done_unlocked: + iput(inode); + ceph_put_string(extra_info.pool_ns); + return; flush_cap_releases: /* @@ -3963,14 +3993,8 @@ flush_cap_releases: * along for the mds (who clearly thinks we still have this * cap). 
*/ - ceph_send_cap_releases(mdsc, session); - -done: - mutex_unlock(&session->s_mutex); -done_unlocked: - iput(inode); - ceph_put_string(extra_info.pool_ns); - return; + ceph_flush_cap_releases(mdsc, session); + goto done; bad: pr_err("ceph_handle_caps: corrupt message\n"); diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index abdf98deeec4..98365e74cb4a 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -139,23 +139,6 @@ static int caps_show(struct seq_file *s, void *p) return 0; } -static int dentry_lru_show(struct seq_file *s, void *ptr) -{ - struct ceph_fs_client *fsc = s->private; - struct ceph_mds_client *mdsc = fsc->mdsc; - struct ceph_dentry_info *di; - - spin_lock(&mdsc->dentry_lru_lock); - list_for_each_entry(di, &mdsc->dentry_lru, lru) { - struct dentry *dentry = di->dentry; - seq_printf(s, "%p %p\t%pd\n", - di, dentry, dentry); - } - spin_unlock(&mdsc->dentry_lru_lock); - - return 0; -} - static int mds_sessions_show(struct seq_file *s, void *ptr) { struct ceph_fs_client *fsc = s->private; @@ -195,7 +178,6 @@ static int mds_sessions_show(struct seq_file *s, void *ptr) CEPH_DEFINE_SHOW_FUNC(mdsmap_show) CEPH_DEFINE_SHOW_FUNC(mdsc_show) CEPH_DEFINE_SHOW_FUNC(caps_show) -CEPH_DEFINE_SHOW_FUNC(dentry_lru_show) CEPH_DEFINE_SHOW_FUNC(mds_sessions_show) @@ -231,7 +213,6 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) debugfs_remove(fsc->debugfs_mds_sessions); debugfs_remove(fsc->debugfs_caps); debugfs_remove(fsc->debugfs_mdsc); - debugfs_remove(fsc->debugfs_dentry_lru); } int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) @@ -291,14 +272,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) if (!fsc->debugfs_caps) goto out; - fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", - 0400, - fsc->client->debugfs_dir, - fsc, - &dentry_lru_show_fops); - if (!fsc->debugfs_dentry_lru) - goto out; - return 0; out: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 82928cea0209..0637149fb9f9 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -29,6 +29,9 @@ const struct dentry_operations ceph_dentry_ops; +static bool __dentry_lease_is_valid(struct ceph_dentry_info *di); +static int __dir_lease_try_check(const struct dentry *dentry); + /* * Initialize ceph dentry state. */ @@ -44,7 +47,7 @@ static int ceph_d_init(struct dentry *dentry) di->lease_session = NULL; di->time = jiffies; dentry->d_fsdata = di; - ceph_dentry_lru_add(dentry); + INIT_LIST_HEAD(&di->lease_list); return 0; } @@ -241,6 +244,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, goto out; } if (fpos_cmp(ctx->pos, di->offset) <= 0) { + __ceph_dentry_dir_lease_touch(di); emit_dentry = true; } spin_unlock(&dentry->d_lock); @@ -1125,13 +1129,277 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, } /* + * Move dentry to tail of mdsc->dentry_leases list when lease is updated. + * Leases at front of the list will expire first. (Assume all leases have + * similar duration) + * + * Called under dentry->d_lock. 
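+ * If the dentry is currently on the shrinker's dispose list
+ * (CEPH_DENTRY_SHRINK_LIST), only the 'referenced' flag is set here;
+ * __dentry_leases_walk() re-lists the dentry once it is done with it.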
+ */
+void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
+{
+	struct dentry *dn = di->dentry;
+	struct ceph_mds_client *mdsc;
+
+	dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn);
+
+	di->flags |= CEPH_DENTRY_LEASE_LIST;
+	if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
+		di->flags |= CEPH_DENTRY_REFERENCED;
+		return;
+	}
+
+	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+	spin_lock(&mdsc->dentry_list_lock);
+	list_move_tail(&di->lease_list, &mdsc->dentry_leases);
+	spin_unlock(&mdsc->dentry_list_lock);
+}
+
+static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc,
+				     struct ceph_dentry_info *di)
+{
+	di->flags &= ~(CEPH_DENTRY_LEASE_LIST | CEPH_DENTRY_REFERENCED);
+	di->lease_gen = 0;
+	di->time = jiffies;
+	list_move_tail(&di->lease_list, &mdsc->dentry_dir_leases);
+}
+
+/*
+ * When dir lease is used, add dentry to tail of mdsc->dentry_dir_leases
+ * list if it's not in the list, otherwise set 'referenced' flag.
+ *
+ * Called under dentry->d_lock.
+ */
+void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
+{
+	struct dentry *dn = di->dentry;
+	struct ceph_mds_client *mdsc;
+
+	dout("dentry_dir_lease_touch %p %p '%pd' (offset %lld)\n",
+	     di, dn, dn, di->offset);
+
+	if (!list_empty(&di->lease_list)) {
+		if (di->flags & CEPH_DENTRY_LEASE_LIST) {
+			/* don't remove dentry from dentry lease list
+			 * if its lease is valid */
+			if (__dentry_lease_is_valid(di))
+				return;
+		} else {
+			di->flags |= CEPH_DENTRY_REFERENCED;
+			return;
+		}
+	}
+
+	if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
+		di->flags |= CEPH_DENTRY_REFERENCED;
+		di->flags &= ~CEPH_DENTRY_LEASE_LIST;
+		return;
+	}
+
+	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+	spin_lock(&mdsc->dentry_list_lock);
+	__dentry_dir_lease_touch(mdsc, di);
+	spin_unlock(&mdsc->dentry_list_lock);
+}
+
+static void __dentry_lease_unlist(struct ceph_dentry_info *di)
+{
+	struct ceph_mds_client *mdsc;
+	if (di->flags & CEPH_DENTRY_SHRINK_LIST)
+		return;
+	if (list_empty(&di->lease_list))
+		return;
+
+	mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc;
+	spin_lock(&mdsc->dentry_list_lock);
+	list_del_init(&di->lease_list);
+	spin_unlock(&mdsc->dentry_list_lock);
+}
+
+enum {
+	KEEP = 0,
+	DELETE = 1,
+	TOUCH = 2,
+	STOP = 4,
+};
+
+struct ceph_lease_walk_control {
+	bool dir_lease;
+	bool expire_dir_lease;
+	unsigned long nr_to_scan;
+	unsigned long dir_lease_ttl;
+};
+
+static unsigned long
+__dentry_leases_walk(struct ceph_mds_client *mdsc,
+		     struct ceph_lease_walk_control *lwc,
+		     int (*check)(struct dentry*, void*))
+{
+	struct ceph_dentry_info *di, *tmp;
+	struct dentry *dentry, *last = NULL;
+	struct list_head* list;
+	LIST_HEAD(dispose);
+	unsigned long freed = 0;
+	int ret = 0;
+
+	list = lwc->dir_lease ?
&mdsc->dentry_dir_leases : &mdsc->dentry_leases; + spin_lock(&mdsc->dentry_list_lock); + list_for_each_entry_safe(di, tmp, list, lease_list) { + if (!lwc->nr_to_scan) + break; + --lwc->nr_to_scan; + + dentry = di->dentry; + if (last == dentry) + break; + + if (!spin_trylock(&dentry->d_lock)) + continue; + + if (dentry->d_lockref.count < 0) { + list_del_init(&di->lease_list); + goto next; + } + + ret = check(dentry, lwc); + if (ret & TOUCH) { + /* move it into tail of dir lease list */ + __dentry_dir_lease_touch(mdsc, di); + if (!last) + last = dentry; + } + if (ret & DELETE) { + /* stale lease */ + di->flags &= ~CEPH_DENTRY_REFERENCED; + if (dentry->d_lockref.count > 0) { + /* update_dentry_lease() will re-add + * it to lease list, or + * ceph_d_delete() will return 1 when + * last reference is dropped */ + list_del_init(&di->lease_list); + } else { + di->flags |= CEPH_DENTRY_SHRINK_LIST; + list_move_tail(&di->lease_list, &dispose); + dget_dlock(dentry); + } + } +next: + spin_unlock(&dentry->d_lock); + if (ret & STOP) + break; + } + spin_unlock(&mdsc->dentry_list_lock); + + while (!list_empty(&dispose)) { + di = list_first_entry(&dispose, struct ceph_dentry_info, + lease_list); + dentry = di->dentry; + spin_lock(&dentry->d_lock); + + list_del_init(&di->lease_list); + di->flags &= ~CEPH_DENTRY_SHRINK_LIST; + if (di->flags & CEPH_DENTRY_REFERENCED) { + spin_lock(&mdsc->dentry_list_lock); + if (di->flags & CEPH_DENTRY_LEASE_LIST) { + list_add_tail(&di->lease_list, + &mdsc->dentry_leases); + } else { + __dentry_dir_lease_touch(mdsc, di); + } + spin_unlock(&mdsc->dentry_list_lock); + } else { + freed++; + } + + spin_unlock(&dentry->d_lock); + /* ceph_d_delete() does the trick */ + dput(dentry); + } + return freed; +} + +static int __dentry_lease_check(struct dentry *dentry, void *arg) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + int ret; + + if (__dentry_lease_is_valid(di)) + return STOP; + ret = __dir_lease_try_check(dentry); + if (ret == -EBUSY) + return KEEP; + if (ret > 0) + return TOUCH; + return DELETE; +} + +static int __dir_lease_check(struct dentry *dentry, void *arg) +{ + struct ceph_lease_walk_control *lwc = arg; + struct ceph_dentry_info *di = ceph_dentry(dentry); + + int ret = __dir_lease_try_check(dentry); + if (ret == -EBUSY) + return KEEP; + if (ret > 0) { + if (time_before(jiffies, di->time + lwc->dir_lease_ttl)) + return STOP; + /* Move dentry to tail of dir lease list if we don't want + * to delete it. 
So dentries in the list are checked in a
+		 * round-robin manner */
+		if (!lwc->expire_dir_lease)
+			return TOUCH;
+		if (dentry->d_lockref.count > 0 ||
+		    (di->flags & CEPH_DENTRY_REFERENCED))
+			return TOUCH;
+		/* invalidate dir lease */
+		di->lease_shared_gen = 0;
+	}
+	return DELETE;
+}
+
+int ceph_trim_dentries(struct ceph_mds_client *mdsc)
+{
+	struct ceph_lease_walk_control lwc;
+	unsigned long count;
+	unsigned long freed;
+
+	spin_lock(&mdsc->caps_list_lock);
+	if (mdsc->caps_use_max > 0 &&
+	    mdsc->caps_use_count > mdsc->caps_use_max)
+		count = mdsc->caps_use_count - mdsc->caps_use_max;
+	else
+		count = 0;
+	spin_unlock(&mdsc->caps_list_lock);
+
+	lwc.dir_lease = false;
+	lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2;
+	freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check);
+	if (!lwc.nr_to_scan) /* more invalid leases */
+		return -EAGAIN;
+
+	if (lwc.nr_to_scan < CEPH_CAPS_PER_RELEASE)
+		lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE;
+
+	lwc.dir_lease = true;
+	lwc.expire_dir_lease = freed < count;
+	lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
+	freed += __dentry_leases_walk(mdsc, &lwc, __dir_lease_check);
+	if (!lwc.nr_to_scan) /* more to check */
+		return -EAGAIN;
+
+	return freed > 0 ? 1 : 0;
+}
+
+/*
  * Ensure a dentry lease will no longer revalidate.
  */
 void ceph_invalidate_dentry_lease(struct dentry *dentry)
 {
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
 	spin_lock(&dentry->d_lock);
-	ceph_dentry(dentry)->time = jiffies;
-	ceph_dentry(dentry)->lease_shared_gen = 0;
+	di->time = jiffies;
+	di->lease_shared_gen = 0;
+	__dentry_lease_unlist(di);
 	spin_unlock(&dentry->d_lock);
 }
 
@@ -1139,45 +1407,59 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
  * Check if dentry lease is valid. If not, delete the lease. Try to
  * renew if the lease is more than half up.
  */
+static bool __dentry_lease_is_valid(struct ceph_dentry_info *di)
+{
+	struct ceph_mds_session *session;
+
+	if (!di->lease_gen)
+		return false;
+
+	session = di->lease_session;
+	if (session) {
+		u32 gen;
+		unsigned long ttl;
+
+		spin_lock(&session->s_gen_ttl_lock);
+		gen = session->s_cap_gen;
+		ttl = session->s_cap_ttl;
+		spin_unlock(&session->s_gen_ttl_lock);
+
+		if (di->lease_gen == gen &&
+		    time_before(jiffies, ttl) &&
+		    time_before(jiffies, di->time))
+			return true;
+	}
+	di->lease_gen = 0;
+	return false;
+}
+
 static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
 				 struct inode *dir)
 {
 	struct ceph_dentry_info *di;
-	struct ceph_mds_session *s;
-	int valid = 0;
-	u32 gen;
-	unsigned long ttl;
 	struct ceph_mds_session *session = NULL;
 	u32 seq = 0;
+	int valid = 0;
 
 	spin_lock(&dentry->d_lock);
 	di = ceph_dentry(dentry);
-	if (di && di->lease_session) {
-		s = di->lease_session;
-		spin_lock(&s->s_gen_ttl_lock);
-		gen = s->s_cap_gen;
-		ttl = s->s_cap_ttl;
-		spin_unlock(&s->s_gen_ttl_lock);
+	if (di && __dentry_lease_is_valid(di)) {
+		valid = 1;
 
-		if (di->lease_gen == gen &&
-		    time_before(jiffies, di->time) &&
-		    time_before(jiffies, ttl)) {
-			valid = 1;
-			if (di->lease_renew_after &&
-			    time_after(jiffies, di->lease_renew_after)) {
-				/*
-				 * We should renew. If we're in RCU walk mode
-				 * though, we can't do that so just return
-				 * -ECHILD.
-				 */
-				if (flags & LOOKUP_RCU) {
-					valid = -ECHILD;
-				} else {
-					session = ceph_get_mds_session(s);
-					seq = di->lease_seq;
-					di->lease_renew_after = 0;
-					di->lease_renew_from = jiffies;
-				}
+		if (di->lease_renew_after &&
+		    time_after(jiffies, di->lease_renew_after)) {
+			/*
+			 * We should renew. If we're in RCU walk mode
+			 * though, we can't do that so just return
+			 * -ECHILD.
+			 */
+			if (flags & LOOKUP_RCU) {
+				valid = -ECHILD;
+			} else {
+				session = ceph_get_mds_session(di->lease_session);
+				seq = di->lease_seq;
+				di->lease_renew_after = 0;
+				di->lease_renew_from = jiffies;
 			}
 		}
 	}
@@ -1193,6 +1475,38 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
 }
 
 /*
+ * Called under dentry->d_lock.
+ */
+static int __dir_lease_try_check(const struct dentry *dentry)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	struct inode *dir;
+	struct ceph_inode_info *ci;
+	int valid = 0;
+
+	if (!di->lease_shared_gen)
+		return 0;
+	if (IS_ROOT(dentry))
+		return 0;
+
+	dir = d_inode(dentry->d_parent);
+	ci = ceph_inode(dir);
+
+	if (spin_trylock(&ci->i_ceph_lock)) {
+		if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen &&
+		    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 0))
+			valid = 1;
+		spin_unlock(&ci->i_ceph_lock);
+	} else {
+		valid = -EBUSY;
+	}
+
+	if (!valid)
+		di->lease_shared_gen = 0;
+	return valid;
+}
+
+/*
  * Check if directory-wide content lease/cap is valid.
  */
 static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
@@ -1205,6 +1519,8 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
 	if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen)
 		valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
 	spin_unlock(&ci->i_ceph_lock);
+	if (valid)
+		__ceph_dentry_dir_lease_touch(di);
 	dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
 	     dir, (unsigned)atomic_read(&ci->i_shared_gen),
 	     dentry, (unsigned)di->lease_shared_gen, valid);
@@ -1297,11 +1613,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
 	}
 
 	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
-	if (valid) {
-		ceph_dentry_lru_touch(dentry);
-	} else {
+	if (!valid)
 		ceph_dir_clear_complete(dir);
-	}
 
 	if (!(flags & LOOKUP_RCU))
 		dput(parent);
@@ -1309,6 +1622,31 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
 }
 
 /*
+ * Delete an unused dentry that doesn't have a valid lease
+ *
+ * Called under dentry->d_lock.
+ */
+static int ceph_d_delete(const struct dentry *dentry)
+{
+	struct ceph_dentry_info *di;
+
+	/* won't release caps */
+	if (d_really_is_negative(dentry))
+		return 0;
+	if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
+		return 0;
+	/* valid lease? */
+	di = ceph_dentry(dentry);
+	if (di) {
+		if (__dentry_lease_is_valid(di))
+			return 0;
+		if (__dir_lease_try_check(dentry))
+			return 0;
+	}
+	return 1;
+}
+
+/*
  * Release our ceph_dentry_info.
  */
 static void ceph_d_release(struct dentry *dentry)
@@ -1316,9 +1654,9 @@ static void ceph_d_release(struct dentry *dentry)
 	struct ceph_dentry_info *di = ceph_dentry(dentry);
 
 	dout("d_release %p\n", dentry);
 
-	ceph_dentry_lru_del(dentry);
 	spin_lock(&dentry->d_lock);
+	__dentry_lease_unlist(di);
 	dentry->d_fsdata = NULL;
 	spin_unlock(&dentry->d_lock);
 
@@ -1419,49 +1757,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 	return size - left;
 }
 
-/*
- * We maintain a private dentry LRU.
- *
- * FIXME: this needs to be changed to a per-mds lru to be useful.
- */ -void ceph_dentry_lru_add(struct dentry *dn) -{ - struct ceph_dentry_info *di = ceph_dentry(dn); - struct ceph_mds_client *mdsc; - - dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn); - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_add_tail(&di->lru, &mdsc->dentry_lru); - mdsc->num_dentry++; - spin_unlock(&mdsc->dentry_lru_lock); -} - -void ceph_dentry_lru_touch(struct dentry *dn) -{ - struct ceph_dentry_info *di = ceph_dentry(dn); - struct ceph_mds_client *mdsc; - - dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn, - di->offset); - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_move_tail(&di->lru, &mdsc->dentry_lru); - spin_unlock(&mdsc->dentry_lru_lock); -} -void ceph_dentry_lru_del(struct dentry *dn) -{ - struct ceph_dentry_info *di = ceph_dentry(dn); - struct ceph_mds_client *mdsc; - - dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn); - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_del_init(&di->lru); - mdsc->num_dentry--; - spin_unlock(&mdsc->dentry_lru_lock); -} /* * Return name hash for a given dentry. This is dependent on @@ -1470,6 +1766,7 @@ void ceph_dentry_lru_del(struct dentry *dn) unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) { struct ceph_inode_info *dci = ceph_inode(dir); + unsigned hash; switch (dci->i_dir_layout.dl_dir_hash) { case 0: /* for backward compat */ @@ -1477,8 +1774,11 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) return dn->d_name.hash; default: - return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + spin_lock(&dn->d_lock); + hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash, dn->d_name.name, dn->d_name.len); + spin_unlock(&dn->d_lock); + return hash; } } @@ -1531,6 +1831,7 @@ const struct inode_operations ceph_snapdir_iops = { const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, + .d_delete = ceph_d_delete, .d_release = ceph_d_release, .d_prune = ceph_d_prune, .d_init = ceph_d_init, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 189df668b6a0..9f53c3d99304 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -590,7 +590,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, * but it will at least behave sensibly when they are * in sequence. */ - ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len); + ret = filemap_write_and_wait_range(inode->i_mapping, + off, off + len - 1); if (ret < 0) return ret; @@ -929,14 +930,15 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, (write ? 
"write" : "read"), file, pos, (unsigned)count, snapc, snapc->seq); - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); + ret = filemap_write_and_wait_range(inode->i_mapping, + pos, pos + count - 1); if (ret < 0) return ret; if (write) { int ret2 = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, - (pos + count) >> PAGE_SHIFT); + (pos + count - 1) >> PAGE_SHIFT); if (ret2 < 0) dout("invalidate_inode_pages2_range returned %d\n", ret2); @@ -1132,13 +1134,14 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, dout("sync_write on file %p %lld~%u snapc %p seq %lld\n", file, pos, (unsigned)count, snapc, snapc->seq); - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); + ret = filemap_write_and_wait_range(inode->i_mapping, + pos, pos + count - 1); if (ret < 0) return ret; ret = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, - (pos + count) >> PAGE_SHIFT); + (pos + count - 1) >> PAGE_SHIFT); if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 9d1f34d46627..c2feb310ac1e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -497,7 +497,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_wrbuffer_ref = 0; ci->i_wrbuffer_ref_head = 0; atomic_set(&ci->i_filelock_ref, 0); - atomic_set(&ci->i_shared_gen, 0); + atomic_set(&ci->i_shared_gen, 1); ci->i_rdcache_gen = 0; ci->i_rdcache_revoking = 0; @@ -524,6 +524,7 @@ static void ceph_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct ceph_inode_info *ci = ceph_inode(inode); + kfree(ci->i_symlink); kmem_cache_free(ceph_inode_cachep, ci); } @@ -537,7 +538,7 @@ void ceph_destroy_inode(struct inode *inode) ceph_fscache_unregister_inode_cookie(ci); - ceph_queue_caps_release(inode); + __ceph_remove_caps(inode); if (__ceph_has_any_quota(ci)) ceph_adjust_quota_realms_count(inode, false); @@ -548,20 +549,24 @@ void ceph_destroy_inode(struct inode *inode) */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; - struct ceph_snap_realm *realm = ci->i_snap_realm; - - dout(" dropping residual ref to snap realm %p\n", realm); - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - ci->i_snap_realm = NULL; - if (realm->ino == ci->i_vino.ino) - realm->inode = NULL; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_put_snap_realm(mdsc, realm); + ceph_inode_to_client(inode)->mdsc; + if (ceph_snap(inode) == CEPH_NOSNAP) { + struct ceph_snap_realm *realm = ci->i_snap_realm; + dout(" dropping residual ref to snap realm %p\n", + realm); + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm = NULL; + if (realm->ino == ci->i_vino.ino) + realm->inode = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, realm); + } else { + ceph_put_snapid_map(mdsc, ci->i_snapid_map); + ci->i_snap_realm = NULL; + } } - kfree(ci->i_symlink); while ((n = rb_first(&ci->i_fragtree)) != NULL) { frag = rb_entry(n, struct ceph_inode_frag, node); rb_erase(n, &ci->i_fragtree); @@ -776,6 +781,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data, iinfo->pool_ns_len); + if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map) + ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode)); + spin_lock(&ci->i_ceph_lock); /* @@ -869,6 
+877,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ci->i_rbytes = le64_to_cpu(info->rbytes); ci->i_rfiles = le64_to_cpu(info->rfiles); ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); + ci->i_dir_pin = iinfo->dir_pin; ceph_decode_timespec64(&ci->i_rctime, &info->rctime); } } @@ -899,6 +908,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, case S_IFBLK: case S_IFCHR: case S_IFSOCK: + inode->i_blkbits = PAGE_SHIFT; init_special_inode(inode, inode->i_mode, inode->i_rdev); inode->i_op = &ceph_file_iops; break; @@ -1066,9 +1076,10 @@ static void update_dentry_lease(struct dentry *dentry, goto out_unlock; di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen); - - if (duration == 0) + if (duration == 0) { + __ceph_dentry_dir_lease_touch(di); goto out_unlock; + } if (di->lease_gen == session->s_cap_gen && time_before(ttl, di->time)) @@ -1079,8 +1090,6 @@ static void update_dentry_lease(struct dentry *dentry, di->lease_session = NULL; } - ceph_dentry_lru_touch(dentry); - if (!di->lease_session) di->lease_session = ceph_get_mds_session(session); di->lease_gen = session->s_cap_gen; @@ -1088,6 +1097,8 @@ static void update_dentry_lease(struct dentry *dentry, di->lease_renew_after = half_ttl; di->lease_renew_from = 0; di->time = ttl; + + __ceph_dentry_lease_touch(di); out_unlock: spin_unlock(&dentry->d_lock); if (old_lease_session) @@ -1152,6 +1163,19 @@ static int splice_dentry(struct dentry **pdn, struct inode *in) return 0; } +static int d_name_cmp(struct dentry *dentry, const char *name, size_t len) +{ + int ret; + + /* take d_lock to ensure dentry->d_name stability */ + spin_lock(&dentry->d_lock); + ret = dentry->d_name.len - len; + if (!ret) + ret = memcmp(dentry->d_name.name, name, len); + spin_unlock(&dentry->d_lock); + return ret; +} + /* * Incorporate results into the local cache. This is either just * one inode, or a directory, dentry, and possibly linked-to inode (e.g., @@ -1401,7 +1425,8 @@ retry_lookup: err = splice_dentry(&req->r_dentry, in); if (err < 0) goto done; - } else if (rinfo->head->is_dentry) { + } else if (rinfo->head->is_dentry && + !d_name_cmp(req->r_dentry, rinfo->dname, rinfo->dname_len)) { struct ceph_vino *ptvino = NULL; if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) || @@ -2259,10 +2284,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat, if (!err) { generic_fillattr(inode, stat); stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); - if (ceph_snap(inode) != CEPH_NOSNAP) - stat->dev = ceph_snap(inode); + if (ceph_snap(inode) == CEPH_NOSNAP) + stat->dev = inode->i_sb->s_dev; else - stat->dev = 0; + stat->dev = ci->i_snapid_map ? 
ci->i_snapid_map->dev : 0; + if (S_ISDIR(inode->i_mode)) { if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 163fc74bf221..9049c2a3e972 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -20,6 +20,8 @@ #include <linux/ceph/auth.h> #include <linux/ceph/debugfs.h> +#define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE) + /* * A cluster of MDS (metadata server) daemons is responsible for * managing the file system namespace (the directory hierarchy and @@ -46,13 +48,17 @@ */ struct ceph_reconnect_state { - int nr_caps; + struct ceph_mds_session *session; + int nr_caps, nr_realms; struct ceph_pagelist *pagelist; unsigned msg_version; + bool allow_multi; }; static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head); +static void ceph_cap_release_work(struct work_struct *work); +static void ceph_cap_reclaim_work(struct work_struct *work); static const struct ceph_connection_operations mds_con_ops; @@ -61,6 +67,29 @@ static const struct ceph_connection_operations mds_con_ops; * mds reply parsing */ +static int parse_reply_info_quota(void **p, void *end, + struct ceph_mds_reply_info_in *info) +{ + u8 struct_v, struct_compat; + u32 struct_len; + + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only + * understand encoding with struct_compat == 1. */ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; + ceph_decode_64_safe(p, end, info->max_bytes, bad); + ceph_decode_64_safe(p, end, info->max_files, bad); + *p = end; + return 0; +bad: + return -EIO; +} + /* * parse individual inode info */ @@ -68,8 +97,24 @@ static int parse_reply_info_in(void **p, void *end, struct ceph_mds_reply_info_in *info, u64 features) { - int err = -EIO; + int err = 0; + u8 struct_v = 0; + if (features == (u64)-1) { + u32 struct_len; + u8 struct_compat; + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only understand + * encoding with struct_compat == 1. 
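+		 * The versioned block is encoded as: u8 struct_v,
+		 * u8 struct_compat, u32 struct_len, then struct_len bytes of
+		 * payload.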
*/ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; + } + + ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad); info->in = *p; *p += sizeof(struct ceph_mds_reply_inode) + sizeof(*info->in->fragtree.splits) * @@ -87,49 +132,136 @@ static int parse_reply_info_in(void **p, void *end, info->xattr_data = *p; *p += info->xattr_len; - if (features & CEPH_FEATURE_MDS_INLINE_DATA) { + if (features == (u64)-1) { + /* inline data */ ceph_decode_64_safe(p, end, info->inline_version, bad); ceph_decode_32_safe(p, end, info->inline_len, bad); ceph_decode_need(p, end, info->inline_len, bad); info->inline_data = *p; *p += info->inline_len; - } else - info->inline_version = CEPH_INLINE_NONE; + /* quota */ + err = parse_reply_info_quota(p, end, info); + if (err < 0) + goto out_bad; + /* pool namespace */ + ceph_decode_32_safe(p, end, info->pool_ns_len, bad); + if (info->pool_ns_len > 0) { + ceph_decode_need(p, end, info->pool_ns_len, bad); + info->pool_ns_data = *p; + *p += info->pool_ns_len; + } + /* btime, change_attr */ + { + struct ceph_timespec btime; + u64 change_attr; + ceph_decode_need(p, end, sizeof(btime), bad); + ceph_decode_copy(p, &btime, sizeof(btime)); + ceph_decode_64_safe(p, end, change_attr, bad); + } + + /* dir pin */ + if (struct_v >= 2) { + ceph_decode_32_safe(p, end, info->dir_pin, bad); + } else { + info->dir_pin = -ENODATA; + } + + *p = end; + } else { + if (features & CEPH_FEATURE_MDS_INLINE_DATA) { + ceph_decode_64_safe(p, end, info->inline_version, bad); + ceph_decode_32_safe(p, end, info->inline_len, bad); + ceph_decode_need(p, end, info->inline_len, bad); + info->inline_data = *p; + *p += info->inline_len; + } else + info->inline_version = CEPH_INLINE_NONE; + + if (features & CEPH_FEATURE_MDS_QUOTA) { + err = parse_reply_info_quota(p, end, info); + if (err < 0) + goto out_bad; + } else { + info->max_bytes = 0; + info->max_files = 0; + } + + info->pool_ns_len = 0; + info->pool_ns_data = NULL; + if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { + ceph_decode_32_safe(p, end, info->pool_ns_len, bad); + if (info->pool_ns_len > 0) { + ceph_decode_need(p, end, info->pool_ns_len, bad); + info->pool_ns_data = *p; + *p += info->pool_ns_len; + } + } + + info->dir_pin = -ENODATA; + } + return 0; +bad: + err = -EIO; +out_bad: + return err; +} - if (features & CEPH_FEATURE_MDS_QUOTA) { +static int parse_reply_info_dir(void **p, void *end, + struct ceph_mds_reply_dirfrag **dirfrag, + u64 features) +{ + if (features == (u64)-1) { u8 struct_v, struct_compat; u32 struct_len; - - /* - * both struct_v and struct_compat are expected to be >= 1 - */ ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); - if (!struct_v || !struct_compat) + /* struct_v is expected to be >= 1. we only understand + * encoding whose struct_compat == 1. 
*/ + if (!struct_v || struct_compat != 1) goto bad; ceph_decode_32_safe(p, end, struct_len, bad); ceph_decode_need(p, end, struct_len, bad); - ceph_decode_64_safe(p, end, info->max_bytes, bad); - ceph_decode_64_safe(p, end, info->max_files, bad); - } else { - info->max_bytes = 0; - info->max_files = 0; + end = *p + struct_len; } - info->pool_ns_len = 0; - info->pool_ns_data = NULL; - if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { - ceph_decode_32_safe(p, end, info->pool_ns_len, bad); - if (info->pool_ns_len > 0) { - ceph_decode_need(p, end, info->pool_ns_len, bad); - info->pool_ns_data = *p; - *p += info->pool_ns_len; - } + ceph_decode_need(p, end, sizeof(**dirfrag), bad); + *dirfrag = *p; + *p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist); + if (unlikely(*p > end)) + goto bad; + if (features == (u64)-1) + *p = end; + return 0; +bad: + return -EIO; +} + +static int parse_reply_info_lease(void **p, void *end, + struct ceph_mds_reply_lease **lease, + u64 features) +{ + if (features == (u64)-1) { + u8 struct_v, struct_compat; + u32 struct_len; + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only understand + * encoding whose struct_compat == 1. */ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; } + ceph_decode_need(p, end, sizeof(**lease), bad); + *lease = *p; + *p += sizeof(**lease); + if (features == (u64)-1) + *p = end; return 0; bad: - return err; + return -EIO; } /* @@ -147,20 +279,18 @@ static int parse_reply_info_trace(void **p, void *end, if (err < 0) goto out_bad; - if (unlikely(*p + sizeof(*info->dirfrag) > end)) - goto bad; - info->dirfrag = *p; - *p += sizeof(*info->dirfrag) + - sizeof(u32)*le32_to_cpu(info->dirfrag->ndist); - if (unlikely(*p > end)) - goto bad; + err = parse_reply_info_dir(p, end, &info->dirfrag, features); + if (err < 0) + goto out_bad; ceph_decode_32_safe(p, end, info->dname_len, bad); ceph_decode_need(p, end, info->dname_len, bad); info->dname = *p; *p += info->dname_len; - info->dlease = *p; - *p += sizeof(*info->dlease); + + err = parse_reply_info_lease(p, end, &info->dlease, features); + if (err < 0) + goto out_bad; } if (info->head->is_target) { @@ -183,20 +313,16 @@ out_bad: /* * parse readdir results */ -static int parse_reply_info_dir(void **p, void *end, +static int parse_reply_info_readdir(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { u32 num, i = 0; int err; - info->dir_dir = *p; - if (*p + sizeof(*info->dir_dir) > end) - goto bad; - *p += sizeof(*info->dir_dir) + - sizeof(u32)*le32_to_cpu(info->dir_dir->ndist); - if (*p > end) - goto bad; + err = parse_reply_info_dir(p, end, &info->dir_dir, features); + if (err < 0) + goto out_bad; ceph_decode_need(p, end, sizeof(num) + 2, bad); num = ceph_decode_32(p); @@ -222,15 +348,16 @@ static int parse_reply_info_dir(void **p, void *end, while (num) { struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; /* dentry */ - ceph_decode_need(p, end, sizeof(u32)*2, bad); - rde->name_len = ceph_decode_32(p); + ceph_decode_32_safe(p, end, rde->name_len, bad); ceph_decode_need(p, end, rde->name_len, bad); rde->name = *p; *p += rde->name_len; dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name); - rde->lease = *p; - *p += sizeof(struct ceph_mds_reply_lease); + /* dentry lease */ + err = parse_reply_info_lease(p, end, &rde->lease, features); + if 
(err) + goto out_bad; /* inode */ err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) @@ -281,7 +408,8 @@ static int parse_reply_info_create(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { - if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { + if (features == (u64)-1 || + (features & CEPH_FEATURE_REPLY_CREATE_INODE)) { if (*p == end) { info->has_create_ino = false; } else { @@ -310,7 +438,7 @@ static int parse_reply_info_extra(void **p, void *end, if (op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) - return parse_reply_info_dir(p, end, info, features); + return parse_reply_info_readdir(p, end, info, features); else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features); else @@ -494,7 +622,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); spin_lock_init(&s->s_gen_ttl_lock); - s->s_cap_gen = 0; + s->s_cap_gen = 1; s->s_cap_ttl = jiffies - 1; spin_lock_init(&s->s_cap_lock); @@ -510,6 +638,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_cap_reconnect = 0; s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); + INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work); + INIT_LIST_HEAD(&s->s_cap_flushing); mdsc->sessions[mds] = s; @@ -535,6 +665,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc, dout("__unregister_session mds%d %p\n", s->s_mds, s); BUG_ON(mdsc->sessions[s->s_mds] != s); mdsc->sessions[s->s_mds] = NULL; + s->s_state = 0; ceph_con_close(&s->s_con); ceph_put_mds_session(s); atomic_dec(&mdsc->num_sessions); @@ -1197,13 +1328,10 @@ static int iterate_session_caps(struct ceph_mds_session *session, cap->session = NULL; list_del_init(&cap->session_caps); session->s_nr_caps--; - if (cap->queue_release) { - list_add_tail(&cap->session_caps, - &session->s_cap_releases); - session->s_num_cap_releases++; - } else { + if (cap->queue_release) + __ceph_queue_cap_release(session, cap); + else old_cap = cap; /* put_cap it w/o locks held */ - } } if (ret < 0) goto out; @@ -1286,6 +1414,15 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } + + if (drop && + ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } } spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { @@ -1638,7 +1775,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, session->s_trim_caps = 0; } - ceph_send_cap_releases(mdsc, session); + ceph_flush_cap_releases(mdsc, session); return 0; } @@ -1681,8 +1818,8 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, /* * called under s_mutex */ -void ceph_send_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +static void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) { struct ceph_msg *msg = NULL; struct ceph_mds_cap_release *head; @@ -1774,6 +1911,81 @@ out_err: spin_unlock(&session->s_cap_lock); } +static void ceph_cap_release_work(struct work_struct *work) +{ + struct ceph_mds_session *session = + container_of(work, struct ceph_mds_session, s_cap_release_work); + + mutex_lock(&session->s_mutex); + if 
(session->s_state == CEPH_MDS_SESSION_OPEN || + session->s_state == CEPH_MDS_SESSION_HUNG) + ceph_send_cap_releases(session->s_mdsc, session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); +} + +void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + if (mdsc->stopping) + return; + + get_session(session); + if (queue_work(mdsc->fsc->cap_wq, + &session->s_cap_release_work)) { + dout("cap release work queued\n"); + } else { + ceph_put_mds_session(session); + dout("failed to queue cap release work\n"); + } +} + +/* + * caller holds session->s_cap_lock + */ +void __ceph_queue_cap_release(struct ceph_mds_session *session, + struct ceph_cap *cap) +{ + list_add_tail(&cap->session_caps, &session->s_cap_releases); + session->s_num_cap_releases++; + + if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE)) + ceph_flush_cap_releases(session->s_mdsc, session); +} + +static void ceph_cap_reclaim_work(struct work_struct *work) +{ + struct ceph_mds_client *mdsc = + container_of(work, struct ceph_mds_client, cap_reclaim_work); + int ret = ceph_trim_dentries(mdsc); + if (ret == -EAGAIN) + ceph_queue_cap_reclaim_work(mdsc); +} + +void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc) +{ + if (mdsc->stopping) + return; + + if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) { + dout("caps reclaim work queued\n"); + } else { + dout("failed to queue caps reclaim work\n"); + } +} + +void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr) +{ + int val; + if (!nr) + return; + val = atomic_add_return(nr, &mdsc->cap_reclaim_pending); + if (!(val % CEPH_CAPS_PER_RELEASE)) { + atomic_set(&mdsc->cap_reclaim_pending, 0); + ceph_queue_cap_reclaim_work(mdsc); + } +} + /* * requests */ @@ -1958,10 +2170,39 @@ retry: return path; } +/* Duplicate the dentry->d_name.name safely */ +static int clone_dentry_name(struct dentry *dentry, const char **ppath, + int *ppathlen) +{ + u32 len; + char *name; + +retry: + len = READ_ONCE(dentry->d_name.len); + name = kmalloc(len + 1, GFP_NOFS); + if (!name) + return -ENOMEM; + + spin_lock(&dentry->d_lock); + if (dentry->d_name.len != len) { + spin_unlock(&dentry->d_lock); + kfree(name); + goto retry; + } + memcpy(name, dentry->d_name.name, len); + spin_unlock(&dentry->d_lock); + + name[len] = '\0'; + *ppath = name; + *ppathlen = len; + return 0; +} + static int build_dentry_path(struct dentry *dentry, struct inode *dir, const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) + bool *pfreepath, bool parent_locked) { + int ret; char *path; rcu_read_lock(); @@ -1970,8 +2211,15 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, if (dir && ceph_snap(dir) == CEPH_NOSNAP) { *pino = ceph_ino(dir); rcu_read_unlock(); + if (parent_locked) { + *ppath = dentry->d_name.name; + *ppathlen = dentry->d_name.len; + } else { + ret = clone_dentry_name(dentry, ppath, ppathlen); + if (ret) + return ret; + *pfreepath = true; + } - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; return 0; } rcu_read_unlock(); @@ -1979,13 +2227,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; - *pfreepath = 1; + *pfreepath = true; return 0; } static int build_inode_path(struct inode *inode, const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) + bool *pfreepath) { struct dentry *dentry; char *path; @@ -2001,7 +2249,7 @@ static int build_inode_path(struct inode *inode, if (IS_ERR(path)) return 
PTR_ERR(path); *ppath = path; - *pfreepath = 1; + *pfreepath = true; return 0; } @@ -2012,7 +2260,7 @@ static int build_inode_path(struct inode *inode, static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, struct inode *rdiri, const char *rpath, u64 rino, const char **ppath, int *pathlen, - u64 *ino, int *freepath) + u64 *ino, bool *freepath, bool parent_locked) { int r = 0; @@ -2022,7 +2270,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, ceph_snap(rinode)); } else if (rdentry) { r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, - freepath); + freepath, parent_locked); dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); } else if (rpath || rino) { @@ -2048,7 +2296,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; - int freepath1 = 0, freepath2 = 0; + bool freepath1 = false, freepath2 = false; int len; u16 releases; void *p, *end; @@ -2056,16 +2304,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ret = set_request_path_attr(req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, - &path1, &pathlen1, &ino1, &freepath1); + &path1, &pathlen1, &ino1, &freepath1, + test_bit(CEPH_MDS_R_PARENT_LOCKED, + &req->r_req_flags)); if (ret < 0) { msg = ERR_PTR(ret); goto out; } + /* If r_old_dentry is set, then assume that its parent is locked */ ret = set_request_path_attr(NULL, req->r_old_dentry, req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, - &path2, &pathlen2, &ino2, &freepath2); + &path2, &pathlen2, &ino2, &freepath2, true); if (ret < 0) { msg = ERR_PTR(ret); goto out_free1; @@ -2653,7 +2904,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) dout("handle_reply tid %lld result %d\n", tid, result); rinfo = &req->r_reply_info; - err = parse_reply_info(msg, rinfo, session->s_con.peer_features); + if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features)) + err = parse_reply_info(msg, rinfo, (u64)-1); + else + err = parse_reply_info(msg, rinfo, session->s_con.peer_features); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); @@ -2684,7 +2938,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || req->r_op == CEPH_MDS_OP_LSSNAP)) ceph_readdir_prepopulate(req, req->r_session); - ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } current->journal_info = NULL; mutex_unlock(&req->r_fill_mutex); @@ -2693,12 +2946,18 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (realm) ceph_put_snap_realm(mdsc, realm); - if (err == 0 && req->r_target_inode && - test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { - struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); - spin_lock(&ci->i_unsafe_lock); - list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops); - spin_unlock(&ci->i_unsafe_lock); + if (err == 0) { + if (req->r_target_inode && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { + struct ceph_inode_info *ci = + ceph_inode(req->r_target_inode); + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_target_item, + &ci->i_unsafe_iops); + spin_unlock(&ci->i_unsafe_lock); + } + + ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } out_err: mutex_lock(&mdsc->mutex); @@ -2777,6 +3036,25 @@ bad: pr_err("mdsc_handle_forward decode error err=%d\n", err); } 
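
The decoders above all follow one versioned-encoding discipline: when the session advertises CEPHFS_FEATURE_REPLY_ENCODING, handle_reply() passes features == (u64)-1 and each reply section arrives framed by struct_v, struct_compat and a struct_len that bounds the payload, so fields a client does not understand can simply be skipped. A minimal userspace-style sketch of that framing follows; decode_frame and its byte-at-a-time parsing are hypothetical stand-ins (the kernel uses its ceph_decode_* helpers), and a little-endian host is assumed for the memcpy of the wire-format length.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * decode_frame - sketch of the struct_v/struct_compat/struct_len framing
 * used by parse_reply_info_dir()/parse_reply_info_lease() when
 * features == (u64)-1. Advances *p past the frame header and reports
 * where the section ends so unknown trailing fields can be skipped.
 */
static int decode_frame(const uint8_t **p, const uint8_t *end,
			const uint8_t **section_end)
{
	uint8_t struct_v, struct_compat;
	uint32_t struct_len;

	if (end - *p < 6)
		return -1;			/* short buffer: corrupt reply */
	struct_v = (*p)[0];
	struct_compat = (*p)[1];
	memcpy(&struct_len, *p + 2, 4);		/* wire data is little-endian */
	*p += 6;

	/* struct_v is expected to be >= 1; only struct_compat == 1 is understood */
	if (!struct_v || struct_compat != 1)
		return -1;
	if ((size_t)(end - *p) < struct_len)
		return -1;			/* frame overruns the message */

	*section_end = *p + struct_len;		/* caller sets *p = *section_end when done */
	return 0;
}

The useful property is forward compatibility: anything between the fields a client decodes and *section_end is skipped, which is exactly why parse_reply_info_lease() sets *p = end before returning in the features == (u64)-1 case.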
+static int __decode_and_drop_session_metadata(void **p, void *end) +{ + /* map<string,string> */ + u32 n; + ceph_decode_32_safe(p, end, n, bad); + while (n-- > 0) { + u32 len; + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_need(p, end, len, bad); + *p += len; + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_need(p, end, len, bad); + *p += len; + } + return 0; +bad: + return -1; +} + /* * handle a mds session control message */ @@ -2784,18 +3062,36 @@ static void handle_session(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; + int mds = session->s_mds; + int msg_version = le16_to_cpu(msg->hdr.version); + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + struct ceph_mds_session_head *h; u32 op; u64 seq; - int mds = session->s_mds; - struct ceph_mds_session_head *h = msg->front.iov_base; + unsigned long features = 0; int wake = 0; /* decode */ - if (msg->front.iov_len < sizeof(*h)) - goto bad; + ceph_decode_need(&p, end, sizeof(*h), bad); + h = p; + p += sizeof(*h); + op = le32_to_cpu(h->op); seq = le64_to_cpu(h->seq); + if (msg_version >= 3) { + u32 len; + /* version >= 2, metadata */ + if (__decode_and_drop_session_metadata(&p, end) < 0) + goto bad; + /* version >= 3, feature bits */ + ceph_decode_32_safe(&p, end, len, bad); + ceph_decode_need(&p, end, len, bad); + memcpy(&features, p, min_t(size_t, len, sizeof(features))); + p += len; + } + mutex_lock(&mdsc->mutex); if (op == CEPH_SESSION_CLOSE) { get_session(session); @@ -2821,6 +3117,7 @@ static void handle_session(struct ceph_mds_session *session, if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info("mds%d reconnect success\n", session->s_mds); session->s_state = CEPH_MDS_SESSION_OPEN; + session->s_features = features; renewed_caps(mdsc, session, 0); wake = 1; if (mdsc->stopping) @@ -2947,6 +3244,82 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, mutex_unlock(&mdsc->mutex); } +static int send_reconnect_partial(struct ceph_reconnect_state *recon_state) +{ + struct ceph_msg *reply; + struct ceph_pagelist *_pagelist; + struct page *page; + __le32 *addr; + int err = -ENOMEM; + + if (!recon_state->allow_multi) + return -ENOSPC; + + /* can't handle message that contains both caps and realms */ + BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms); + + /* pre-allocate new pagelist */ + _pagelist = ceph_pagelist_alloc(GFP_NOFS); + if (!_pagelist) + return -ENOMEM; + + reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); + if (!reply) + goto fail_msg; + + /* placeholder for nr_caps */ + err = ceph_pagelist_encode_32(_pagelist, 0); + if (err < 0) + goto fail; + + if (recon_state->nr_caps) { + /* currently encoding caps */ + err = ceph_pagelist_encode_32(recon_state->pagelist, 0); + if (err) + goto fail; + } else { + /* placeholder for nr_realms (currently encoding realms) */ + err = ceph_pagelist_encode_32(_pagelist, 0); + if (err < 0) + goto fail; + } + + err = ceph_pagelist_encode_8(recon_state->pagelist, 1); + if (err) + goto fail; + + page = list_first_entry(&recon_state->pagelist->head, struct page, lru); + addr = kmap_atomic(page); + if (recon_state->nr_caps) { + /* currently encoding caps */ + *addr = cpu_to_le32(recon_state->nr_caps); + } else { + /* currently encoding realms */ + *(addr + 1) = cpu_to_le32(recon_state->nr_realms); + } + kunmap_atomic(addr); + + reply->hdr.version = cpu_to_le16(5); + reply->hdr.compat_version = cpu_to_le16(4); + + reply->hdr.data_len = 
cpu_to_le32(recon_state->pagelist->length); + ceph_msg_data_add_pagelist(reply, recon_state->pagelist); + + ceph_con_send(&recon_state->session->s_con, reply); + ceph_pagelist_release(recon_state->pagelist); + + recon_state->pagelist = _pagelist; + recon_state->nr_caps = 0; + recon_state->nr_realms = 0; + recon_state->msg_version = 5; + return 0; +fail: + ceph_msg_put(reply); +fail_msg: + ceph_pagelist_release(_pagelist); + return err; +} + /* * Encode information about a cap for a reconnect with the MDS. */ @@ -2966,9 +3339,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, dout(" adding %p ino %llx.%llx cap %p %lld %s\n", inode, ceph_vinop(inode), cap, cap->cap_id, ceph_cap_string(cap->issued)); - err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); - if (err) - return err; spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ @@ -3008,7 +3378,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->msg_version >= 2) { int num_fcntl_locks, num_flock_locks; struct ceph_filelock *flocks = NULL; - size_t struct_len, total_len = 0; + size_t struct_len, total_len = sizeof(u64); u8 struct_v = 0; encode_again: @@ -3043,7 +3413,7 @@ encode_again: if (recon_state->msg_version >= 3) { /* version, compat_version and struct_len */ - total_len = 2 * sizeof(u8) + sizeof(u32); + total_len += 2 * sizeof(u8) + sizeof(u32); struct_v = 2; } /* @@ -3060,12 +3430,19 @@ encode_again: struct_len += sizeof(u64); /* snap_follows */ total_len += struct_len; - err = ceph_pagelist_reserve(pagelist, total_len); - if (err) { - kfree(flocks); - goto out_err; + + if (pagelist->length + total_len > RECONNECT_MAX_SIZE) { + err = send_reconnect_partial(recon_state); + if (err) + goto out_freeflocks; + pagelist = recon_state->pagelist; } + err = ceph_pagelist_reserve(pagelist, total_len); + if (err) + goto out_freeflocks; + + ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); if (recon_state->msg_version >= 3) { ceph_pagelist_encode_8(pagelist, struct_v); ceph_pagelist_encode_8(pagelist, 1); @@ -3077,7 +3454,7 @@ encode_again: num_fcntl_locks, num_flock_locks); if (struct_v >= 2) ceph_pagelist_encode_64(pagelist, snap_follows); - +out_freeflocks: kfree(flocks); } else { u64 pathbase = 0; @@ -3098,20 +3475,81 @@ encode_again: } err = ceph_pagelist_reserve(pagelist, - pathlen + sizeof(u32) + sizeof(rec.v1)); + sizeof(u64) + sizeof(u32) + + pathlen + sizeof(rec.v1)); if (err) { - kfree(path); - goto out_err; + goto out_freepath; } + ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); ceph_pagelist_encode_string(pagelist, path, pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); - +out_freepath: kfree(path); } - recon_state->nr_caps++; out_err: + if (err >= 0) + recon_state->nr_caps++; + return err; +} + +static int encode_snap_realms(struct ceph_mds_client *mdsc, + struct ceph_reconnect_state *recon_state) +{ + struct rb_node *p; + struct ceph_pagelist *pagelist = recon_state->pagelist; + int err = 0; + + if (recon_state->msg_version >= 4) { + err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms); + if (err < 0) + goto fail; + } + + /* + * snaprealms. we provide mds with the ino, seq (version), and + * parent for all of our realms. If the mds has any newer info, + * it will tell us. 
+ */ + for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { + struct ceph_snap_realm *realm = + rb_entry(p, struct ceph_snap_realm, node); + struct ceph_mds_snaprealm_reconnect sr_rec; + + if (recon_state->msg_version >= 4) { + size_t need = sizeof(u8) * 2 + sizeof(u32) + + sizeof(sr_rec); + + if (pagelist->length + need > RECONNECT_MAX_SIZE) { + err = send_reconnect_partial(recon_state); + if (err) + goto fail; + pagelist = recon_state->pagelist; + } + + err = ceph_pagelist_reserve(pagelist, need); + if (err) + goto fail; + + ceph_pagelist_encode_8(pagelist, 1); + ceph_pagelist_encode_8(pagelist, 1); + ceph_pagelist_encode_32(pagelist, sizeof(sr_rec)); + } + + dout(" adding snap realm %llx seq %lld parent %llx\n", + realm->ino, realm->seq, realm->parent_ino); + sr_rec.ino = cpu_to_le64(realm->ino); + sr_rec.seq = cpu_to_le64(realm->seq); + sr_rec.parent = cpu_to_le64(realm->parent_ino); + + err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); + if (err) + goto fail; + + recon_state->nr_realms++; + } +fail: return err; } @@ -3132,18 +3570,17 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_msg *reply; - struct rb_node *p; int mds = session->s_mds; int err = -ENOMEM; - int s_nr_caps; - struct ceph_pagelist *pagelist; - struct ceph_reconnect_state recon_state; + struct ceph_reconnect_state recon_state = { + .session = session, + }; LIST_HEAD(dispose); pr_info("mds%d reconnect start\n", mds); - pagelist = ceph_pagelist_alloc(GFP_NOFS); - if (!pagelist) + recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS); + if (!recon_state.pagelist) goto fail_nopagelist; reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); @@ -3187,63 +3624,90 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, /* replay unsafe requests */ replay_unsafe_requests(mdsc, session); + ceph_early_kick_flushing_caps(mdsc, session); + down_read(&mdsc->snap_rwsem); - /* traverse this session's caps */ - s_nr_caps = session->s_nr_caps; - err = ceph_pagelist_encode_32(pagelist, s_nr_caps); + /* placeholder for nr_caps */ + err = ceph_pagelist_encode_32(recon_state.pagelist, 0); if (err) goto fail; - recon_state.nr_caps = 0; - recon_state.pagelist = pagelist; - if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) + if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) { recon_state.msg_version = 3; - else + recon_state.allow_multi = true; + } else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) { + recon_state.msg_version = 3; + } else { recon_state.msg_version = 2; + } + /* traverse this session's caps */ err = iterate_session_caps(session, encode_caps_cb, &recon_state); spin_lock(&session->s_cap_lock); session->s_cap_reconnect = 0; spin_unlock(&session->s_cap_lock); - /* - * snaprealms. we provide mds with the ino, seq (version), and - * parent for all of our realms. If the mds has any newer info, - * it will tell us. 
- */ - for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { - struct ceph_snap_realm *realm = - rb_entry(p, struct ceph_snap_realm, node); - struct ceph_mds_snaprealm_reconnect sr_rec; + if (err < 0) + goto fail; - dout(" adding snap realm %llx seq %lld parent %llx\n", - realm->ino, realm->seq, realm->parent_ino); - sr_rec.ino = cpu_to_le64(realm->ino); - sr_rec.seq = cpu_to_le64(realm->seq); - sr_rec.parent = cpu_to_le64(realm->parent_ino); - err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); - if (err) - goto fail; + /* check if all realms can be encoded into current message */ + if (mdsc->num_snap_realms) { + size_t total_len = + recon_state.pagelist->length + + mdsc->num_snap_realms * + sizeof(struct ceph_mds_snaprealm_reconnect); + if (recon_state.msg_version >= 4) { + /* number of realms */ + total_len += sizeof(u32); + /* version, compat_version and struct_len */ + total_len += mdsc->num_snap_realms * + (2 * sizeof(u8) + sizeof(u32)); + } + if (total_len > RECONNECT_MAX_SIZE) { + if (!recon_state.allow_multi) { + err = -ENOSPC; + goto fail; + } + if (recon_state.nr_caps) { + err = send_reconnect_partial(&recon_state); + if (err) + goto fail; + } + recon_state.msg_version = 5; + } } - reply->hdr.version = cpu_to_le16(recon_state.msg_version); + err = encode_snap_realms(mdsc, &recon_state); + if (err < 0) + goto fail; - /* raced with cap release? */ - if (s_nr_caps != recon_state.nr_caps) { - struct page *page = list_first_entry(&pagelist->head, - struct page, lru); + if (recon_state.msg_version >= 5) { + err = ceph_pagelist_encode_8(recon_state.pagelist, 0); + if (err < 0) + goto fail; + } + + if (recon_state.nr_caps || recon_state.nr_realms) { + struct page *page = + list_first_entry(&recon_state.pagelist->head, + struct page, lru); __le32 *addr = kmap_atomic(page); - *addr = cpu_to_le32(recon_state.nr_caps); + if (recon_state.nr_caps) { + WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms); + *addr = cpu_to_le32(recon_state.nr_caps); + } else if (recon_state.msg_version >= 4) { + *(addr + 1) = cpu_to_le32(recon_state.nr_realms); + } kunmap_atomic(addr); } - reply->hdr.data_len = cpu_to_le32(pagelist->length); - ceph_msg_data_add_pagelist(reply, pagelist); + reply->hdr.version = cpu_to_le16(recon_state.msg_version); + if (recon_state.msg_version >= 4) + reply->hdr.compat_version = cpu_to_le16(4); - ceph_early_kick_flushing_caps(mdsc, session); + reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length); + ceph_msg_data_add_pagelist(reply, recon_state.pagelist); ceph_con_send(&session->s_con, reply); @@ -3254,7 +3718,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, mutex_unlock(&mdsc->mutex); up_read(&mdsc->snap_rwsem); - ceph_pagelist_release(pagelist); + ceph_pagelist_release(recon_state.pagelist); return; fail: @@ -3262,7 +3726,7 @@ fail: up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); fail_nomsg: - ceph_pagelist_release(pagelist); + ceph_pagelist_release(recon_state.pagelist); fail_nopagelist: pr_err("error %d preparing reconnect for mds%d\n", err, mds); return; @@ -3580,7 +4044,6 @@ static void delayed_work(struct work_struct *work) int renew_caps; dout("mdsc delayed_work\n"); - ceph_check_delayed_caps(mdsc); mutex_lock(&mdsc->mutex); renew_interval = mdsc->mdsmap->m_session_timeout >> 2; @@ -3628,6 +4091,12 @@ static void delayed_work(struct work_struct *work) } mutex_unlock(&mdsc->mutex); + ceph_check_delayed_caps(mdsc); + + ceph_queue_cap_reclaim_work(mdsc); + + ceph_trim_snapid_map(mdsc); + schedule_delayed(mdsc); } 
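
Both send_reconnect_partial() and the rewritten send_mds_reconnect() above rely on the same encode-then-backpatch trick: a zero is written where nr_caps/nr_realms will eventually live, records are appended until RECONNECT_MAX_SIZE would be exceeded, and the first page is kmapped at the end so the real count can be written over the placeholder. A self-contained sketch of that pattern over a flat buffer is below; recbuf, put32 and encode_with_count are hypothetical stand-ins for the kernel's ceph_pagelist API, and a little-endian host is assumed.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical flat buffer standing in for a ceph_pagelist. */
struct recbuf {
	uint8_t data[4096];
	size_t len;
};

static void put32(struct recbuf *b, uint32_t v)
{
	assert(b->len + sizeof(v) <= sizeof(b->data));
	memcpy(b->data + b->len, &v, sizeof(v));	/* little-endian host assumed */
	b->len += sizeof(v);
}

/* Encode a placeholder count, append records, then backpatch the count. */
static void encode_with_count(struct recbuf *b, const uint32_t *recs, size_t n)
{
	size_t count_off = b->len;	/* remember where the count lives */
	uint32_t emitted = 0;
	size_t i;

	put32(b, 0);			/* like ceph_pagelist_encode_32(pagelist, 0) */
	for (i = 0; i < n; i++) {
		put32(b, recs[i]);	/* append one record */
		emitted++;
	}
	/* same idea as kmap_atomic()ing the first page and rewriting *addr */
	memcpy(b->data + count_off, &emitted, sizeof(emitted));
}

Backpatching instead of pre-counting matters here because the number of caps and realms that fit in one reconnect message is only known once encoding hits the RECONNECT_MAX_SIZE threshold and the message is split.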
@@ -3660,6 +4129,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; INIT_LIST_HEAD(&mdsc->snap_empty); + mdsc->num_snap_realms = 0; spin_lock_init(&mdsc->snap_empty_lock); mdsc->last_tid = 0; mdsc->oldest_tid = 0; @@ -3677,11 +4147,19 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) mdsc->num_cap_flushing = 0; spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); - spin_lock_init(&mdsc->dentry_lru_lock); - INIT_LIST_HEAD(&mdsc->dentry_lru); + INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work); + atomic_set(&mdsc->cap_reclaim_pending, 0); + + spin_lock_init(&mdsc->dentry_list_lock); + INIT_LIST_HEAD(&mdsc->dentry_leases); + INIT_LIST_HEAD(&mdsc->dentry_dir_leases); ceph_caps_init(mdsc); - ceph_adjust_min_caps(mdsc, fsc->min_caps); + ceph_adjust_caps_max_min(mdsc, fsc->mount_options); + + spin_lock_init(&mdsc->snapid_map_lock); + mdsc->snapid_map_tree = RB_ROOT; + INIT_LIST_HEAD(&mdsc->snapid_map_lru); init_rwsem(&mdsc->pool_perm_rwsem); mdsc->pool_perm_tree = RB_ROOT; @@ -3876,8 +4354,10 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) WARN_ON(!list_empty(&mdsc->cap_delay_list)); mutex_unlock(&mdsc->mutex); + ceph_cleanup_snapid_map(mdsc); ceph_cleanup_empty_realms(mdsc); + cancel_work_sync(&mdsc->cap_reclaim_work); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ dout("stopped\n"); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 729da155ebf0..50385a481fdb 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -21,11 +21,14 @@ #define CEPHFS_FEATURE_REPLY_ENCODING 9 #define CEPHFS_FEATURE_RECLAIM_CLIENT 10 #define CEPHFS_FEATURE_LAZY_CAP_WANTED 11 +#define CEPHFS_FEATURE_MULTI_RECONNECT 12 #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ 0, 1, 2, 3, 4, 5, 6, 7, \ CEPHFS_FEATURE_MIMIC, \ + CEPHFS_FEATURE_REPLY_ENCODING, \ CEPHFS_FEATURE_LAZY_CAP_WANTED, \ + CEPHFS_FEATURE_MULTI_RECONNECT, \ } #define CEPHFS_FEATURES_CLIENT_REQUIRED {} @@ -65,6 +68,7 @@ struct ceph_mds_reply_info_in { char *pool_ns_data; u64 max_bytes; u64 max_files; + s32 dir_pin; }; struct ceph_mds_reply_dir_entry { @@ -152,6 +156,7 @@ struct ceph_mds_session { int s_mds; int s_state; unsigned long s_ttl; /* time until mds kills us */ + unsigned long s_features; u64 s_seq; /* incoming msg seq # */ struct mutex s_mutex; /* serialize session messages */ @@ -167,19 +172,20 @@ struct ceph_mds_session { /* protected by s_cap_lock */ spinlock_t s_cap_lock; struct list_head s_caps; /* all caps issued by this session */ + struct ceph_cap *s_cap_iterator; int s_nr_caps, s_trim_caps; int s_num_cap_releases; int s_cap_reconnect; int s_readonly; struct list_head s_cap_releases; /* waiting cap_release messages */ - struct ceph_cap *s_cap_iterator; + struct work_struct s_cap_release_work; /* protected by mutex */ struct list_head s_cap_flushing; /* inodes w/ flushing caps */ unsigned long s_renew_requested; /* last time we sent a renew req */ u64 s_renew_seq; - refcount_t s_ref; + refcount_t s_ref; struct list_head s_waiting; /* waiting requests */ struct list_head s_unsafe; /* unsafe requests */ }; @@ -310,6 +316,15 @@ struct ceph_pool_perm { char pool_ns[]; }; +struct ceph_snapid_map { + struct rb_node node; + struct list_head lru; + atomic_t ref; + u64 snap; + dev_t dev; + unsigned long last_used; +}; + /* * mds client state */ @@ -341,6 +356,7 @@ struct ceph_mds_client { struct rw_semaphore snap_rwsem; struct rb_root snap_realms; struct list_head snap_empty; + int num_snap_realms; 
spinlock_t snap_empty_lock; /* protect snap_empty */ u64 last_tid; /* most recent mds request */ @@ -362,6 +378,9 @@ struct ceph_mds_client { spinlock_t cap_dirty_lock; /* protects above items */ wait_queue_head_t cap_flushing_wq; + struct work_struct cap_reclaim_work; + atomic_t cap_reclaim_pending; + /* * Cap reservations * @@ -378,13 +397,18 @@ struct ceph_mds_client { unreserved) */ int caps_total_count; /* total caps allocated */ int caps_use_count; /* in use */ + int caps_use_max; /* max used caps */ int caps_reserve_count; /* unused, reserved */ int caps_avail_count; /* unused, unreserved */ int caps_min_count; /* keep at least this many (unreserved) */ - spinlock_t dentry_lru_lock; - struct list_head dentry_lru; - int num_dentry; + spinlock_t dentry_list_lock; + struct list_head dentry_leases; /* fifo list */ + struct list_head dentry_dir_leases; /* lru list */ + + spinlock_t snapid_map_lock; + struct rb_root snapid_map_tree; + struct list_head snapid_map_lru; struct rw_semaphore pool_perm_rwsem; struct rb_root pool_perm_tree; @@ -438,9 +462,12 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) kref_put(&req->r_kref, ceph_mdsc_release_request); } -extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session); - +extern void __ceph_queue_cap_release(struct ceph_mds_session *session, + struct ceph_cap *cap); +extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); +extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index f74193da0e09..b26e12cd8ec3 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -3,12 +3,13 @@ #include <linux/sort.h> #include <linux/slab.h> - #include "super.h" #include "mds_client.h" - #include <linux/ceph/decode.h> +/* unused map expires after 5 minutes */ +#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ) + /* * Snapshots in ceph are driven in large part by cooperation from the * client. 
In contrast to local file systems or file servers that @@ -124,6 +125,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm( INIT_LIST_HEAD(&realm->inodes_with_caps); spin_lock_init(&realm->inodes_with_caps_lock); __insert_snap_realm(&mdsc->snap_realms, realm); + mdsc->num_snap_realms++; + dout("create_snap_realm %llx %p\n", realm->ino, realm); return realm; } @@ -175,6 +178,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc, dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); rb_erase(&realm->node, &mdsc->snap_realms); + mdsc->num_snap_realms--; if (realm->parent) { list_del_init(&realm->child_item); @@ -568,7 +572,12 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) old_snapc = NULL; update_snapc: - if (ci->i_head_snapc) { + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ci->i_head_snapc = NULL; + } else { ci->i_head_snapc = ceph_get_snap_context(new_snapc); dout(" new snapc is %p\n", new_snapc); } @@ -986,3 +995,154 @@ out: up_write(&mdsc->snap_rwsem); return; } + +struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, + u64 snap) +{ + struct ceph_snapid_map *sm, *exist; + struct rb_node **p, *parent; + int ret; + + exist = NULL; + spin_lock(&mdsc->snapid_map_lock); + p = &mdsc->snapid_map_tree.rb_node; + while (*p) { + exist = rb_entry(*p, struct ceph_snapid_map, node); + if (snap > exist->snap) { + p = &(*p)->rb_left; + } else if (snap < exist->snap) { + p = &(*p)->rb_right; + } else { + if (atomic_inc_return(&exist->ref) == 1) + list_del_init(&exist->lru); + break; + } + exist = NULL; + } + spin_unlock(&mdsc->snapid_map_lock); + if (exist) { + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + return exist; + } + + sm = kmalloc(sizeof(*sm), GFP_NOFS); + if (!sm) + return NULL; + + ret = get_anon_bdev(&sm->dev); + if (ret < 0) { + kfree(sm); + return NULL; + } + + INIT_LIST_HEAD(&sm->lru); + atomic_set(&sm->ref, 1); + sm->snap = snap; + + exist = NULL; + parent = NULL; + p = &mdsc->snapid_map_tree.rb_node; + spin_lock(&mdsc->snapid_map_lock); + while (*p) { + parent = *p; + exist = rb_entry(*p, struct ceph_snapid_map, node); + if (snap > exist->snap) + p = &(*p)->rb_left; + else if (snap < exist->snap) + p = &(*p)->rb_right; + else + break; + exist = NULL; + } + if (exist) { + if (atomic_inc_return(&exist->ref) == 1) + list_del_init(&exist->lru); + } else { + rb_link_node(&sm->node, parent, p); + rb_insert_color(&sm->node, &mdsc->snapid_map_tree); + } + spin_unlock(&mdsc->snapid_map_lock); + if (exist) { + free_anon_bdev(sm->dev); + kfree(sm); + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + return exist; + } + + dout("create snapid map %llx -> %x\n", sm->snap, sm->dev); + return sm; +} + +void ceph_put_snapid_map(struct ceph_mds_client* mdsc, + struct ceph_snapid_map *sm) +{ + if (!sm) + return; + if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) { + if (!RB_EMPTY_NODE(&sm->node)) { + sm->last_used = jiffies; + list_add_tail(&sm->lru, &mdsc->snapid_map_lru); + spin_unlock(&mdsc->snapid_map_lock); + } else { + /* already cleaned up by + * ceph_cleanup_snapid_map() */ + spin_unlock(&mdsc->snapid_map_lock); + kfree(sm); + } + } +} + +void ceph_trim_snapid_map(struct ceph_mds_client *mdsc) +{ + struct ceph_snapid_map *sm; + unsigned long now; + LIST_HEAD(to_free); + + spin_lock(&mdsc->snapid_map_lock); + now = jiffies; + + while (!list_empty(&mdsc->snapid_map_lru)) { + sm = list_first_entry(&mdsc->snapid_map_lru, + struct 
ceph_snapid_map, lru); + if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now)) + break; + + rb_erase(&sm->node, &mdsc->snapid_map_tree); + list_move(&sm->lru, &to_free); + } + spin_unlock(&mdsc->snapid_map_lock); + + while (!list_empty(&to_free)) { + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); + list_del(&sm->lru); + dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev); + free_anon_bdev(sm->dev); + kfree(sm); + } +} + +void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc) +{ + struct ceph_snapid_map *sm; + struct rb_node *p; + LIST_HEAD(to_free); + + spin_lock(&mdsc->snapid_map_lock); + while ((p = rb_first(&mdsc->snapid_map_tree))) { + sm = rb_entry(p, struct ceph_snapid_map, node); + rb_erase(p, &mdsc->snapid_map_tree); + RB_CLEAR_NODE(p); + list_move(&sm->lru, &to_free); + } + spin_unlock(&mdsc->snapid_map_lock); + + while (!list_empty(&to_free)) { + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); + list_del(&sm->lru); + free_anon_bdev(sm->dev); + if (WARN_ON_ONCE(atomic_read(&sm->ref))) { + pr_err("snapid map %llx -> %x still in use\n", + sm->snap, sm->dev); + } + } +} diff --git a/fs/ceph/super.c b/fs/ceph/super.c index da2cd8e89062..6d5bb2f74612 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -133,6 +133,7 @@ enum { Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, + Opt_caps_max, Opt_readdir_max_entries, Opt_readdir_max_bytes, Opt_congestion_kb, @@ -175,6 +176,7 @@ static match_table_t fsopt_tokens = { {Opt_rasize, "rasize=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, + {Opt_caps_max, "caps_max=%d"}, {Opt_readdir_max_entries, "readdir_max_entries=%d"}, {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, @@ -286,6 +288,11 @@ static int parse_fsopt_token(char *c, void *private) return -EINVAL; fsopt->caps_wanted_delay_max = intval; break; + case Opt_caps_max: + if (intval < 0) + return -EINVAL; + fsopt->caps_max = intval; + break; case Opt_readdir_max_entries: if (intval < 1) return -EINVAL; @@ -576,6 +583,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",rasize=%d", fsopt->rasize); if (fsopt->congestion_kb != default_congestion_kb()) seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + if (fsopt->caps_max) + seq_printf(m, ",caps_max=%d", fsopt->caps_max); if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) seq_printf(m, ",caps_wanted_delay_min=%d", fsopt->caps_wanted_delay_min); @@ -671,6 +680,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); if (!fsc->trunc_wq) goto fail_pg_inv_wq; + fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1); + if (!fsc->cap_wq) + goto fail_trunc_wq; /* set up mempools */ err = -ENOMEM; @@ -678,13 +690,12 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, size = sizeof (struct page *) * (page_count ? 
page_count : 1); fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size); if (!fsc->wb_pagevec_pool) - goto fail_trunc_wq; - - /* caps */ - fsc->min_caps = fsopt->max_readdir; + goto fail_cap_wq; return fsc; +fail_cap_wq: + destroy_workqueue(fsc->cap_wq); fail_trunc_wq: destroy_workqueue(fsc->trunc_wq); fail_pg_inv_wq: @@ -706,6 +717,7 @@ static void flush_fs_workqueues(struct ceph_fs_client *fsc) flush_workqueue(fsc->wb_wq); flush_workqueue(fsc->pg_inv_wq); flush_workqueue(fsc->trunc_wq); + flush_workqueue(fsc->cap_wq); } static void destroy_fs_client(struct ceph_fs_client *fsc) @@ -715,6 +727,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) destroy_workqueue(fsc->wb_wq); destroy_workqueue(fsc->pg_inv_wq); destroy_workqueue(fsc->trunc_wq); + destroy_workqueue(fsc->cap_wq); mempool_destroy(fsc->wb_pagevec_pool); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index dfb64a5211b6..16c03188578e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -79,6 +79,7 @@ struct ceph_mount_options { int rasize; /* max readahead */ int congestion_kb; /* max writeback in flight */ int caps_wanted_delay_min, caps_wanted_delay_max; + int caps_max; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ @@ -100,17 +101,18 @@ struct ceph_fs_client { struct ceph_client *client; unsigned long mount_state; - int min_caps; /* min caps i added */ loff_t max_file_size; struct ceph_mds_client *mdsc; /* writeback */ mempool_t *wb_pagevec_pool; + atomic_long_t writeback_count; + struct workqueue_struct *wb_wq; struct workqueue_struct *pg_inv_wq; struct workqueue_struct *trunc_wq; - atomic_long_t writeback_count; + struct workqueue_struct *cap_wq; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_dentry_lru, *debugfs_caps; @@ -260,17 +262,22 @@ struct ceph_inode_xattr { * Ceph dentry state */ struct ceph_dentry_info { + struct dentry *dentry; struct ceph_mds_session *lease_session; + struct list_head lease_list; + unsigned flags; int lease_shared_gen; u32 lease_gen; u32 lease_seq; unsigned long lease_renew_after, lease_renew_from; - struct list_head lru; - struct dentry *dentry; unsigned long time; u64 offset; }; +#define CEPH_DENTRY_REFERENCED 1 +#define CEPH_DENTRY_LEASE_LIST 2 +#define CEPH_DENTRY_SHRINK_LIST 4 + struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. 
we avoid the overhead of parsing @@ -318,6 +325,8 @@ struct ceph_inode_info { /* quotas */ u64 i_max_bytes, i_max_files; + s32 i_dir_pin; + struct rb_root i_fragtree; int i_fragtree_nsplits; struct mutex i_fragtree_mutex; @@ -370,7 +379,10 @@ struct ceph_inode_info { struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ spinlock_t i_unsafe_lock; - struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ + union { + struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ + struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */ + }; int i_snap_realm_counter; /* snap realm (if caps) */ struct list_head i_snap_realm_item; struct list_head i_snap_flush_item; @@ -587,7 +599,7 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, struct ceph_inode_frag *pfrag, int *found); -static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) +static inline struct ceph_dentry_info *ceph_dentry(const struct dentry *dentry) { return (struct ceph_dentry_info *)dentry->d_fsdata; } @@ -656,7 +668,8 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); -extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); +extern void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, + struct ceph_mount_options *fsopt); extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc, @@ -837,6 +850,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); +extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, + u64 snap); +extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc, + struct ceph_snapid_map *sm); +extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc); +extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc); + + /* * a cap_snap is "pending" if it is still awaiting an in-progress * sync write (that may/may not still update size, mtime, etc.). 
@@ -975,11 +996,11 @@ extern void ceph_add_cap(struct inode *inode, unsigned cap, unsigned seq, u64 realmino, int flags, struct ceph_cap **new_cap); extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); +extern void __ceph_remove_caps(struct inode* inode); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); extern int ceph_is_any_caps(struct inode *inode); -extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync); @@ -1049,10 +1070,10 @@ extern int ceph_handle_snapdir(struct ceph_mds_request *req, extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); -extern void ceph_dentry_lru_add(struct dentry *dn); -extern void ceph_dentry_lru_touch(struct dentry *dn); -extern void ceph_dentry_lru_del(struct dentry *dn); +extern void __ceph_dentry_lease_touch(struct ceph_dentry_info *di); +extern void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern int ceph_trim_dentries(struct ceph_mds_client *mdsc); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 316f6ad10644..0cc42c8879e9 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -228,8 +228,19 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, ci->i_rctime.tv_nsec); } -/* quotas */ +/* dir pin */ +static bool ceph_vxattrcb_dir_pin_exists(struct ceph_inode_info *ci) +{ + return ci->i_dir_pin != -ENODATA; +} + +static size_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%d", (int)ci->i_dir_pin); +} +/* quotas */ static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci) { bool ret = false; @@ -315,6 +326,13 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_RSTAT_FIELD(dir, rbytes), XATTR_RSTAT_FIELD(dir, rctime), { + .name = "ceph.dir.pin", + .name_size = sizeof("ceph.dir.pin"), + .getxattr_cb = ceph_vxattrcb_dir_pin, + .exists_cb = ceph_vxattrcb_dir_pin_exists, + .flags = VXATTR_FLAG_HIDDEN, + }, + { .name = "ceph.quota", .name_size = sizeof("ceph.quota"), .getxattr_cb = ceph_vxattrcb_quota, diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index f1ddc9d03c10..76724efc831c 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -117,25 +117,25 @@ config CIFS_UPCALL secure Kerberos authentication is required). If unsure, say Y. config CIFS_XATTR - bool "CIFS extended attributes" - depends on CIFS - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page for details). - CIFS maps the name of extended attributes beginning with the user - namespace prefix to SMB/CIFS EAs. EAs are stored on Windows - servers without the user namespace prefix, but their names are - seen by Linux cifs clients prefaced by the user namespace prefix. - The system namespace (used by some filesystems to store ACLs) is - not supported at this time. - - If unsure, say Y. + bool "CIFS extended attributes" + depends on CIFS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page for details). 
+ CIFS maps the name of extended attributes beginning with the user + namespace prefix to SMB/CIFS EAs. EAs are stored on Windows + servers without the user namespace prefix, but their names are + seen by Linux cifs clients prefaced by the user namespace prefix. + The system namespace (used by some filesystems to store ACLs) is + not supported at this time. + + If unsure, say Y. config CIFS_POSIX - bool "CIFS POSIX Extensions" - depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR - help - Enabling this option will cause the cifs client to attempt to + bool "CIFS POSIX Extensions" + depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR + help + Enabling this option will cause the cifs client to attempt to negotiate a newer dialect with servers, such as Samba 3.0.5 or later, that optionally can handle more POSIX like (rather than Windows like) file behavior. It also enables @@ -144,61 +144,62 @@ config CIFS_POSIX CIFS POSIX ACL support. If unsure, say N. config CIFS_ACL - bool "Provide CIFS ACL support" - depends on CIFS_XATTR && KEYS - help - Allows fetching CIFS/NTFS ACL from the server. The DACL blob - is handed over to the application/caller. See the man - page for getcifsacl for more information. If unsure, say Y. + bool "Provide CIFS ACL support" + depends on CIFS_XATTR && KEYS + help + Allows fetching CIFS/NTFS ACL from the server. The DACL blob + is handed over to the application/caller. See the man + page for getcifsacl for more information. If unsure, say Y. config CIFS_DEBUG bool "Enable CIFS debugging routines" default y depends on CIFS help - Enabling this option adds helpful debugging messages to - the cifs code which increases the size of the cifs module. - If unsure, say Y. + Enabling this option adds helpful debugging messages to + the cifs code which increases the size of the cifs module. + If unsure, say Y. + config CIFS_DEBUG2 bool "Enable additional CIFS debugging routines" depends on CIFS_DEBUG help - Enabling this option adds a few more debugging routines - to the cifs code which slightly increases the size of - the cifs module and can cause additional logging of debug - messages in some error paths, slowing performance. This - option can be turned off unless you are debugging - cifs problems. If unsure, say N. + Enabling this option adds a few more debugging routines + to the cifs code which slightly increases the size of + the cifs module and can cause additional logging of debug + messages in some error paths, slowing performance. This + option can be turned off unless you are debugging + cifs problems. If unsure, say N. config CIFS_DEBUG_DUMP_KEYS bool "Dump encryption keys for offline decryption (Unsafe)" depends on CIFS_DEBUG help - Enabling this will dump the encryption and decryption keys - used to communicate on an encrypted share connection on the - console. This allows Wireshark to decrypt and dissect - encrypted network captures. Enable this carefully. - If unsure, say N. + Enabling this will dump the encryption and decryption keys + used to communicate on an encrypted share connection on the + console. This allows Wireshark to decrypt and dissect + encrypted network captures. Enable this carefully. + If unsure, say N. config CIFS_DFS_UPCALL - bool "DFS feature support" - depends on CIFS && KEYS - select DNS_RESOLVER - help - Distributed File System (DFS) support is used to access shares - transparently in an enterprise name space, even if the share - moves to a different server. 
This feature also enables - an upcall mechanism for CIFS which contacts userspace helper - utilities to provide server name resolution (host names to - IP addresses) which is needed in order to reconnect to - servers if their addresses change or for implicit mounts of - DFS junction points. If unsure, say Y. + bool "DFS feature support" + depends on CIFS && KEYS + select DNS_RESOLVER + help + Distributed File System (DFS) support is used to access shares + transparently in an enterprise name space, even if the share + moves to a different server. This feature also enables + an upcall mechanism for CIFS which contacts userspace helper + utilities to provide server name resolution (host names to + IP addresses) which is needed in order to reconnect to + servers if their addresses change or for implicit mounts of + DFS junction points. If unsure, say Y. config CIFS_NFSD_EXPORT - bool "Allow nfsd to export CIFS file system" - depends on CIFS && BROKEN - help - Allows NFS server to export a CIFS mounted share (nfsd over cifs) + bool "Allow nfsd to export CIFS file system" + depends on CIFS && BROKEN + help + Allows NFS server to export a CIFS mounted share (nfsd over cifs) config CIFS_SMB_DIRECT bool "SMB Direct support (Experimental)" @@ -209,10 +210,9 @@ config CIFS_SMB_DIRECT say N. config CIFS_FSCACHE - bool "Provide CIFS client caching support" - depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y - help - Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data - to be cached locally on disk through the general filesystem cache - manager. If unsure, say N. - + bool "Provide CIFS client caching support" + depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y + help + Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data + to be cached locally on disk through the general filesystem cache + manager. If unsure, say N. 
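
The fs/cifs/Kconfig churn above is whitespace-only: the option bodies are reindented to the kernel's Kconfig convention, in which attribute lines under a config entry use a single tab and help text uses one tab plus two extra spaces. A minimal illustration, using a hypothetical option name rather than any real fs/cifs entry:

config CIFS_EXAMPLE
	bool "Hypothetical example option"
	depends on CIFS
	help
	  Help text is indented by one tab plus two spaces; the keyword
	  lines above it by a single tab. That is the layout the
	  reindented entries above now follow consistently.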
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index e92a2fee3c57..13c1288b04a7 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -115,7 +115,12 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) seq_puts(m, " type: CDROM "); else seq_printf(m, " type: %d ", dev_type); - if (tcon->seal) + + seq_printf(m, "Serial Number: 0x%x", tcon->vol_serial_number); + + if ((tcon->seal) || + (tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) || + (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA)) seq_printf(m, " Encrypted"); if (tcon->nocase) seq_printf(m, " nocase"); @@ -371,6 +376,10 @@ skip_rdma: atomic_read(&server->in_send), atomic_read(&server->num_waiters)); #endif + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + seq_puts(m, " encrypted"); + if (ses->sign) + seq_puts(m, " signed"); seq_puts(m, "\n\tShares:"); j = 0; diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index d9b99abe1243..5d83c924cc47 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -285,9 +285,9 @@ static void dump_referral(const struct dfs_info3_param *ref) { cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name); cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name); - cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n", + cifs_dbg(FYI, "DFS: fl: %d, srv_type: %d\n", ref->flags, ref->server_type); - cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n", + cifs_dbg(FYI, "DFS: ref_flags: %d, path_consumed: %d\n", ref->ref_flag, ref->path_consumed); } diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 42f0d67f1054..ed49222abecb 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -58,6 +58,7 @@ struct cifs_sb_info { spinlock_t tlink_tree_lock; struct tcon_link *master_tlink; struct nls_table *local_nls; + unsigned int bsize; unsigned int rsize; unsigned int wsize; unsigned long actimeo; /* attribute cache timeout (jiffies) */ diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index d8bce2f862de..086ddc5108af 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -43,6 +43,9 @@ struct smb_snapshot_array { /* snapshots[]; */ } __packed; +/* query_info flags */ +#define PASSTHRU_QUERY_INFO 0x00000000 +#define PASSTHRU_FSCTL 0x00000001 struct smb_query_info { __u32 info_type; __u32 file_info_class; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 62d48d486d8f..a05bf1d6e1d0 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -381,7 +381,7 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses) seq_puts(s, "ntlm"); break; case Kerberos: - seq_puts(s, "krb5"); + seq_printf(s, "krb5,cruid=%u", from_kuid_munged(&init_user_ns,ses->cred_uid)); break; case RawNTLMSSP: seq_puts(s, "ntlmssp"); @@ -554,10 +554,13 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rsize=%u", cifs_sb->rsize); seq_printf(s, ",wsize=%u", cifs_sb->wsize); + seq_printf(s, ",bsize=%u", cifs_sb->bsize); seq_printf(s, ",echo_interval=%lu", tcon->ses->server->echo_interval / HZ); if (tcon->snapshot_time) seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); + if (tcon->handle_timeout) + seq_printf(s, ",handletimeout=%u", tcon->handle_timeout); /* convert actimeo and display it in seconds */ seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); @@ -1007,7 +1010,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, unsigned int xid; int rc; - if (remap_flags & ~REMAP_FILE_ADVISORY) + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; cifs_dbg(FYI, "clone 
range\n"); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 7652551a1fc4..5c0298b9998f 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -150,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.17" +#define CIFS_VERSION "2.19" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 94dbdbe5be34..585ad3207cb1 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -60,6 +60,12 @@ #define CIFS_MAX_ACTIMEO (1 << 30) /* + * Max persistent and resilient handle timeout (milliseconds). + * Windows durable max was 960000 (16 minutes) + */ +#define SMB3_MAX_HANDLE_TIMEOUT 960000 + +/* * MAX_REQ is the maximum number of requests that WE will send * on one socket concurrently. */ @@ -216,6 +222,7 @@ struct cifs_io_parms; struct cifs_search_info; struct cifsInodeInfo; struct cifs_open_parms; +struct cifs_credits; struct smb_version_operations { int (*send_cancel)(struct TCP_Server_Info *, struct smb_rqst *, @@ -230,12 +237,15 @@ struct smb_version_operations { /* check response: verify signature, map error */ int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *, bool); - void (*add_credits)(struct TCP_Server_Info *, const unsigned int, - const int); + void (*add_credits)(struct TCP_Server_Info *server, + const struct cifs_credits *credits, + const int optype); void (*set_credits)(struct TCP_Server_Info *, const int); int * (*get_credits_field)(struct TCP_Server_Info *, const int); unsigned int (*get_credits)(struct mid_q_entry *); __u64 (*get_next_mid)(struct TCP_Server_Info *); + void (*revert_current_mid)(struct TCP_Server_Info *server, + const unsigned int val); /* data offset from read response message */ unsigned int (*read_data_offset)(char *); /* @@ -383,8 +393,8 @@ struct smb_version_operations { struct cifs_fid *); /* calculate a size of SMB message */ unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi); - /* check for STATUS_PENDING and process it in a positive case */ - bool (*is_status_pending)(char *, struct TCP_Server_Info *, int); + /* check for STATUS_PENDING and process the response if yes */ + bool (*is_status_pending)(char *buf, struct TCP_Server_Info *server); /* check for STATUS_NETWORK_SESSION_EXPIRED */ bool (*is_session_expired)(char *); /* send oplock break response */ @@ -452,7 +462,11 @@ struct smb_version_operations { unsigned int (*wp_retry_size)(struct inode *); /* get mtu credits */ int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int, - unsigned int *, unsigned int *); + unsigned int *, struct cifs_credits *); + /* adjust previously taken mtu credits to request size */ + int (*adjust_credits)(struct TCP_Server_Info *server, + struct cifs_credits *credits, + const unsigned int payload_size); /* check if we need to issue closedir */ bool (*dir_needs_close)(struct cifsFileInfo *); long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t, @@ -471,6 +485,14 @@ struct smb_version_operations { struct cifs_tcon *tcon, __le16 *path, int is_dir, unsigned long p); + /* make unix special files (block, char, fifo, socket) */ + int (*make_node)(unsigned int xid, + struct inode *inode, + struct dentry *dentry, + struct cifs_tcon *tcon, + char *full_path, + umode_t mode, + dev_t device_number); }; struct smb_version_values { @@ -557,6 +579,7 @@ struct smb_vol { bool resilient:1; /* noresilient not required since not fored for CA */ bool 
domainauto:1; bool rdma:1; + unsigned int bsize; unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; @@ -569,6 +592,7 @@ struct smb_vol { struct nls_table *local_nls; unsigned int echo_interval; /* echo interval in secs */ __u64 snapshot_time; /* needed for timewarp tokens */ + __u32 handle_timeout; /* persistent and durable handle timeout in ms */ unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */ }; @@ -710,6 +734,11 @@ struct TCP_Server_Info { int nr_targets; }; +struct cifs_credits { + unsigned int value; + unsigned int instance; +}; + static inline unsigned int in_flight(struct TCP_Server_Info *server) { @@ -721,28 +750,28 @@ in_flight(struct TCP_Server_Info *server) } static inline bool -has_credits(struct TCP_Server_Info *server, int *credits) +has_credits(struct TCP_Server_Info *server, int *credits, int num_credits) { int num; spin_lock(&server->req_lock); num = *credits; spin_unlock(&server->req_lock); - return num > 0; + return num >= num_credits; } static inline void -add_credits(struct TCP_Server_Info *server, const unsigned int add, +add_credits(struct TCP_Server_Info *server, const struct cifs_credits *credits, const int optype) { - server->ops->add_credits(server, add, optype); + server->ops->add_credits(server, credits, optype); } static inline void -add_credits_and_wake_if(struct TCP_Server_Info *server, const unsigned int add, - const int optype) +add_credits_and_wake_if(struct TCP_Server_Info *server, + const struct cifs_credits *credits, const int optype) { - if (add) { - server->ops->add_credits(server, add, optype); + if (credits->value) { + server->ops->add_credits(server, credits, optype); wake_up(&server->request_q); } } @@ -753,6 +782,14 @@ set_credits(struct TCP_Server_Info *server, const int val) server->ops->set_credits(server, val); } +static inline int +adjust_credits(struct TCP_Server_Info *server, struct cifs_credits *credits, + const unsigned int payload_size) +{ + return server->ops->adjust_credits ? + server->ops->adjust_credits(server, credits, payload_size) : 0; +} + static inline __le64 get_next_mid64(struct TCP_Server_Info *server) { @@ -770,6 +807,22 @@ get_next_mid(struct TCP_Server_Info *server) return cpu_to_le16(mid); } +static inline void +revert_current_mid(struct TCP_Server_Info *server, const unsigned int val) +{ + if (server->ops->revert_current_mid) + server->ops->revert_current_mid(server, val); +} + +static inline void +revert_current_mid_from_hdr(struct TCP_Server_Info *server, + const struct smb2_sync_hdr *shdr) +{ + unsigned int num = le16_to_cpu(shdr->CreditCharge); + + return revert_current_mid(server, num > 0 ? 
num : 1); +} + static inline __u16 get_mid(const struct smb_hdr *smb) { @@ -924,11 +977,14 @@ cap_unix(struct cifs_ses *ses) struct cached_fid { bool is_valid:1; /* Do we have a useable root fid */ + bool file_all_info_is_valid:1; + struct kref refcount; struct cifs_fid *fid; struct mutex fid_mutex; struct cifs_tcon *tcon; struct work_struct lease_break; + struct smb2_file_all_info file_all_info; }; /* @@ -1009,6 +1065,7 @@ struct cifs_tcon { __u32 vol_serial_number; __le64 vol_create_time; __u64 snapshot_time; /* for timewarp tokens - timestamp of snapshot */ + __u32 handle_timeout; /* persistent and durable handle timeout in ms */ __u32 ss_flags; /* sector size flags */ __u32 perf_sector_size; /* best sector size for perf */ __u32 max_chunks; @@ -1234,7 +1291,7 @@ struct cifs_readdata { unsigned int pagesz; unsigned int page_offset; unsigned int tailsz; - unsigned int credits; + struct cifs_credits credits; unsigned int nr_pages; struct page **pages; }; @@ -1260,7 +1317,7 @@ struct cifs_writedata { unsigned int pagesz; unsigned int page_offset; unsigned int tailsz; - unsigned int credits; + struct cifs_credits credits; unsigned int nr_pages; struct page **pages; }; @@ -1276,6 +1333,7 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) } struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr); void cifsFileInfo_put(struct cifsFileInfo *cifs_file); #define CIFS_CACHE_READ_FLG 1 @@ -1422,6 +1480,7 @@ struct mid_q_entry { struct kref refcount; struct TCP_Server_Info *server; /* server corresponding to this mid */ __u64 mid; /* multiplex id */ + __u16 credits; /* number of credits consumed by this mid */ __u32 pid; /* process id */ __u32 sequence_number; /* for CIFS signing */ unsigned long when_alloc; /* when mid was created */ @@ -1696,6 +1755,7 @@ require use of the stronger protocol */ * GlobalMid_Lock protects: * list operations on pending_mid_q and oplockQ * updates to XID counters, multiplex id and SMB sequence numbers + * list operations on global DnotifyReqList * tcp_ses_lock protects: * list operations on tcp and SMB session lists * tcon->open_file_lock protects the list of open files hanging off the tcon @@ -1796,6 +1856,7 @@ GLOBAL_EXTERN spinlock_t gidsidlock; #endif /* CONFIG_CIFS_ACL */ void cifs_oplock_break(struct work_struct *work); +void cifs_queue_oplock_break(struct cifsFileInfo *cfile); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 336c116995d7..4f96b3b00a7a 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -93,7 +93,8 @@ extern int cifs_discard_remaining_data(struct TCP_Server_Info *server); extern int cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, mid_receive_t *receive, mid_callback_t *callback, - mid_handle_t *handle, void *cbdata, const int flags); + mid_handle_t *handle, void *cbdata, const int flags, + const struct cifs_credits *exist_credits); extern int cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, struct smb_rqst *rqst, int *resp_buf_type, const int flags, struct kvec *resp_iov); @@ -115,7 +116,7 @@ extern int cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error); extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, unsigned int *num, - unsigned int *credits); + struct cifs_credits *credits); extern int SendReceive2(const unsigned 
int /* xid */ , struct cifs_ses *, struct kvec *, int /* nvec to send */, int * /* type of buf returned */, const int flags, @@ -133,6 +134,9 @@ extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); +extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, + bool fsuid_only, + struct cifsFileInfo **ret_file); extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server); extern int decode_negTokenInit(unsigned char *security_blob, int length, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index bb54ccf8481c..f43747c062a7 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -139,8 +139,8 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc, return -ENOMEM; if (tcon->ipc) { - snprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", - tcon->ses->server->hostname); + scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", + tcon->ses->server->hostname); rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc); goto out; } @@ -172,7 +172,7 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc, continue; } - snprintf(tree, MAX_TREE_SIZE, "\\%s", tgt); + scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt); rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc); if (!rc) @@ -822,9 +822,10 @@ static void cifs_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; + struct cifs_credits credits = { .value = 1, .instance = 0 }; DeleteMidQEntry(mid); - add_credits(server, 1, CIFS_ECHO_OP); + add_credits(server, &credits, CIFS_ECHO_OP); } int @@ -859,7 +860,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server) iov[1].iov_base = (char *)smb + 4; rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback, NULL, - server, CIFS_ASYNC_OP | CIFS_ECHO_OP); + server, CIFS_ASYNC_OP | CIFS_ECHO_OP, NULL); if (rc) cifs_dbg(FYI, "Echo request failed: %d\n", rc); @@ -1605,16 +1606,17 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) } if (server->ops->is_status_pending && - server->ops->is_status_pending(buf, server, 0)) { + server->ops->is_status_pending(buf, server)) { cifs_discard_remaining_data(server); return -1; } /* set up first two iov for signature check and to get credits */ rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = 4; - rdata->iov[1].iov_base = buf + 4; - rdata->iov[1].iov_len = server->total_read - 4; + rdata->iov[0].iov_len = server->vals->header_preamble_size; + rdata->iov[1].iov_base = buf + server->vals->header_preamble_size; + rdata->iov[1].iov_len = + server->total_read - server->vals->header_preamble_size; cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", rdata->iov[0].iov_base, rdata->iov[0].iov_len); cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", @@ -1713,6 +1715,7 @@ cifs_readv_callback(struct mid_q_entry *mid) .rq_npages = rdata->nr_pages, .rq_pagesz = rdata->pagesz, .rq_tailsz = rdata->tailsz }; + struct cifs_credits credits = { .value = 1, .instance = 0 }; cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n", __func__, mid->mid, mid->mid_state, rdata->result, @@ -1750,7 +1753,7 @@ cifs_readv_callback(struct mid_q_entry *mid) queue_work(cifsiod_wq, &rdata->work); DeleteMidQEntry(mid); - add_credits(server, 1, 0); + add_credits(server, &credits, 0); } /* cifs_async_readv - send an async write, and set up mid to handle result */ @@ -1809,7 
+1812,7 @@ cifs_async_readv(struct cifs_readdata *rdata) kref_get(&rdata->refcount); rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive, - cifs_readv_callback, NULL, rdata, 0); + cifs_readv_callback, NULL, rdata, 0, NULL); if (rc == 0) cifs_stats_inc(&tcon->stats.cifs_stats.num_reads); @@ -2123,14 +2126,18 @@ cifs_writev_requeue(struct cifs_writedata *wdata) wdata2->tailsz = tailsz; wdata2->bytes = cur_len; - wdata2->cfile = find_writable_file(CIFS_I(inode), false); + rc = cifs_get_writable_file(CIFS_I(inode), false, + &wdata2->cfile); if (!wdata2->cfile) { - cifs_dbg(VFS, "No writable handles for inode\n"); - rc = -EBADF; - break; + cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n", + rc); + if (!is_retryable_error(rc)) + rc = -EBADF; + } else { + wdata2->pid = wdata2->cfile->pid; + rc = server->ops->async_writev(wdata2, + cifs_writedata_release); } - wdata2->pid = wdata2->cfile->pid; - rc = server->ops->async_writev(wdata2, cifs_writedata_release); for (j = 0; j < nr_pages; j++) { unlock_page(wdata2->pages[j]); @@ -2145,6 +2152,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata) kref_put(&wdata2->refcount, cifs_writedata_release); if (is_retryable_error(rc)) continue; + i += nr_pages; break; } @@ -2152,6 +2160,13 @@ cifs_writev_requeue(struct cifs_writedata *wdata) i += nr_pages; } while (i < wdata->nr_pages); + /* cleanup remaining pages from the original wdata */ + for (; i < wdata->nr_pages; i++) { + SetPageError(wdata->pages[i]); + end_page_writeback(wdata->pages[i]); + put_page(wdata->pages[i]); + } + if (rc != 0 && !is_retryable_error(rc)) mapping_set_error(inode->i_mapping, rc); kref_put(&wdata->refcount, cifs_writedata_release); @@ -2226,6 +2241,7 @@ cifs_writev_callback(struct mid_q_entry *mid) struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); unsigned int written; WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf; + struct cifs_credits credits = { .value = 1, .instance = 0 }; switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: @@ -2261,7 +2277,7 @@ cifs_writev_callback(struct mid_q_entry *mid) queue_work(cifsiod_wq, &wdata->work); DeleteMidQEntry(mid); - add_credits(tcon->ses->server, 1, 0); + add_credits(tcon->ses->server, &credits, 0); } /* cifs_async_writev - send an async write, and set up mid to handle result */ @@ -2339,7 +2355,7 @@ cifs_async_writev(struct cifs_writedata *wdata, kref_get(&wdata->refcount); rc = cifs_call_async(tcon->ses->server, &rqst, NULL, - cifs_writev_callback, NULL, wdata, 0); + cifs_writev_callback, NULL, wdata, 0, NULL); if (rc == 0) cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8463c940e0e5..4c0e44489f21 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -102,8 +102,8 @@ enum { Opt_backupuid, Opt_backupgid, Opt_uid, Opt_cruid, Opt_gid, Opt_file_mode, Opt_dirmode, Opt_port, - Opt_rsize, Opt_wsize, Opt_actimeo, - Opt_echo_interval, Opt_max_credits, + Opt_blocksize, Opt_rsize, Opt_wsize, Opt_actimeo, + Opt_echo_interval, Opt_max_credits, Opt_handletimeout, Opt_snapshot, /* Mount options which take string value */ @@ -204,9 +204,11 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_dirmode, "dirmode=%s" }, { Opt_dirmode, "dir_mode=%s" }, { Opt_port, "port=%s" }, + { Opt_blocksize, "bsize=%s" }, { Opt_rsize, "rsize=%s" }, { Opt_wsize, "wsize=%s" }, { Opt_actimeo, "actimeo=%s" }, + { Opt_handletimeout, "handletimeout=%s" }, { Opt_echo_interval, "echo_interval=%s" }, { Opt_max_credits, "max_credits=%s" }, { Opt_snapshot, "snapshot=%s" 
}, @@ -348,7 +350,7 @@ static int reconn_set_ipaddr(struct TCP_Server_Info *server) cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__); return -ENOMEM; } - snprintf(unc, len, "\\\\%s", server->hostname); + scnprintf(unc, len, "\\\\%s", server->hostname); rc = dns_resolve_server_name_to_ip(unc, &ipaddr); kfree(unc); @@ -592,6 +594,7 @@ cifs_reconnect(struct TCP_Server_Info *server) msleep(3000); } else { atomic_inc(&tcpSesReconnectCount); + set_credits(server, 1); spin_lock(&GlobalMid_Lock); if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsNeedNegotiate; @@ -1053,7 +1056,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) } if (server->ops->is_status_pending && - server->ops->is_status_pending(buf, server, length)) + server->ops->is_status_pending(buf, server)) return -1; if (!mid) @@ -1063,6 +1066,26 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) return 0; } +static void +smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) +{ + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer; + + /* + * SMB1 does not use credits. + */ + if (server->vals->header_preamble_size) + return; + + if (shdr->CreditRequest) { + spin_lock(&server->req_lock); + server->credits += le16_to_cpu(shdr->CreditRequest); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + } +} + + static int cifs_demultiplex_thread(void *p) { @@ -1169,10 +1192,6 @@ next_pdu: continue; } - if (server->large_buf) - buf = server->bigbuf; - - server->lstrp = jiffies; for (i = 0; i < num_mids; i++) { @@ -1192,6 +1211,7 @@ next_pdu: } else if (server->ops->is_oplock_break && server->ops->is_oplock_break(bufs[i], server)) { + smb2_add_credits_from_hdr(bufs[i], server); cifs_dbg(FYI, "Received oplock break\n"); } else { cifs_dbg(VFS, "No task to wake, unknown frame " @@ -1203,6 +1223,7 @@ next_pdu: if (server->ops->dump_detail) server->ops->dump_detail(bufs[i], server); + smb2_add_credits_from_hdr(bufs[i], server); cifs_dump_mids(server); #endif /* CIFS_DEBUG2 */ } @@ -1486,6 +1507,11 @@ cifs_parse_devname(const char *devname, struct smb_vol *vol) const char *delims = "/\\"; size_t len; + if (unlikely(!devname || !*devname)) { + cifs_dbg(VFS, "Device name not specified.\n"); + return -EINVAL; + } + /* make sure we have a valid UNC double delimiter prefix */ len = strspn(devname, delims); if (len != 2) @@ -1571,7 +1597,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->cred_uid = current_uid(); vol->linux_uid = current_uid(); vol->linux_gid = current_gid(); - + vol->bsize = 1024 * 1024; /* can improve cp performance significantly */ /* * default to SFM style remapping of seven reserved characters * unless user overrides it or we negotiate CIFS POSIX where @@ -1594,6 +1620,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->actimeo = CIFS_DEF_ACTIMEO; + /* Most clients set timeout to 0, allows server to use its default */ + vol->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */ + /* offer SMB2.1 and later (SMB3 etc). 
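
The new smb2_add_credits_from_hdr() above closes a credit leak: frames that the demultiplex thread handles without a mid callback (oplock breaks, frames only dumped under CIFS_DEBUG2) still carry a CreditRequest grant from the server. A minimal user-space model of the idea, with an invented toy header and no locking (the kernel takes req_lock and wakes request_q):

#include <stdint.h>
#include <stdio.h>

/* Toy frame header holding only the field the model needs; the real
 * SMB2 sync header carries CreditRequest/CreditResponse as a __le16. */
struct frame_hdr {
	uint16_t credit_request;
};

static unsigned int pool;	/* stands in for server->credits */

/* Frames consumed without a mid callback still carry granted
 * credits; bank them so they are not lost. */
static void add_credits_from_hdr(const struct frame_hdr *hdr)
{
	uint16_t granted = hdr->credit_request; /* le16_to_cpu() in kernel */

	if (granted)
		pool += granted;
}

int main(void)
{
	struct frame_hdr oplock_break = { .credit_request = 1 };

	add_credits_from_hdr(&oplock_break);
	printf("pool=%u\n", pool);
	return 0;
}

The early return for a non-zero header_preamble_size is the SMB1 guard: SMB1 frames have an RFC1002 preamble and no credit field, so the helper must not reinterpret them.
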
Secure and widely accepted */ vol->ops = &smb30_operations; vol->vals = &smbdefault_values; @@ -1944,6 +1973,26 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } port = (unsigned short)option; break; + case Opt_blocksize: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid blocksize value\n", + __func__); + goto cifs_parse_mount_err; + } + /* + * inode blocksize realistically should never need to be + * less than 16K or greater than 16M and default is 1MB. + * Note that small inode block sizes (e.g. 64K) can lead + * to very poor performance of common tools like cp and scp + */ + if ((option < CIFS_MAX_MSGSIZE) || + (option > (4 * SMB3_DEFAULT_IOSIZE))) { + cifs_dbg(VFS, "%s: Invalid blocksize\n", + __func__); + goto cifs_parse_mount_err; + } + vol->bsize = option; + break; case Opt_rsize: if (get_option_ul(args, &option)) { cifs_dbg(VFS, "%s: Invalid rsize value\n", @@ -1972,6 +2021,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } break; + case Opt_handletimeout: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid handletimeout value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->handle_timeout = option; + if (vol->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) { + cifs_dbg(VFS, "Invalid handle cache timeout, longer than 16 minutes\n"); + goto cifs_parse_mount_err; + } + break; case Opt_echo_interval: if (get_option_ul(args, &option)) { cifs_dbg(VFS, "%s: Invalid echo interval value\n", @@ -2609,7 +2670,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); tcp_ses->session_estab = false; tcp_ses->sequence_number = 0; - tcp_ses->reconnect_instance = 0; + tcp_ses->reconnect_instance = 1; tcp_ses->lstrp = jiffies; spin_lock_init(&tcp_ses->req_lock); INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); @@ -2770,7 +2831,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb_vol *volume_info) if (tcon == NULL) return -ENOMEM; - snprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname); + scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname); /* cannot fail */ nls_codepage = load_nls_default(); @@ -3138,6 +3199,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb_vol *volume_info) return 0; if (tcon->snapshot_time != volume_info->snapshot_time) return 0; + if (tcon->handle_timeout != volume_info->handle_timeout) + return 0; return 1; } @@ -3252,6 +3315,16 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon->snapshot_time = volume_info->snapshot_time; } + if (volume_info->handle_timeout) { + if (ses->server->vals->protocol_id == 0) { + cifs_dbg(VFS, + "Use SMB2.1 or later for handle timeout option\n"); + rc = -EOPNOTSUPP; + goto out_fail; + } else + tcon->handle_timeout = volume_info->handle_timeout; + } + tcon->ses = ses; if (volume_info->password) { tcon->password = kstrdup(volume_info->password, GFP_KERNEL); @@ -3839,6 +3912,7 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, spin_lock_init(&cifs_sb->tlink_tree_lock); cifs_sb->tlink_tree = RB_ROOT; + cifs_sb->bsize = pvolume_info->bsize; /* * Temporarily set r/wsize for matching superblock. If we end up using * new sb then client will later negotiate it downward if needed. 
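
The two new mount options parsed above are both bounds-checked. Per the comments in the hunks, bsize must lie in [16K, 16M] (default 1M), and handletimeout is rejected above 16 minutes. A standalone sketch of those checks; the macro names mirror the kernel's but the definitions here are local to the sketch:

#include <stdbool.h>
#include <stdio.h>

#define CIFS_MAX_MSGSIZE	(4 * 4096)		/* 16K lower bound */
#define SMB3_DEFAULT_IOSIZE	(4 * 1024 * 1024)	/* 4M */
#define SMB3_MAX_HANDLE_TIMEOUT	960000			/* 16 min in ms */

/* bsize is accepted only within [CIFS_MAX_MSGSIZE, 4 * SMB3_DEFAULT_IOSIZE];
 * small inode block sizes make tools like cp and scp perform poorly. */
static bool bsize_valid(unsigned long bsize)
{
	return bsize >= CIFS_MAX_MSGSIZE &&
	       bsize <= 4 * SMB3_DEFAULT_IOSIZE;
}

/* 0 is allowed and means "let the server pick its default timeout". */
static bool handle_timeout_valid(unsigned long ms)
{
	return ms <= SMB3_MAX_HANDLE_TIMEOUT;
}

int main(void)
{
	printf("bsize=1M ok: %d\n", bsize_valid(1024 * 1024));	/* default */
	printf("bsize=4K ok: %d\n", bsize_valid(4096));		/* too small */
	printf("timeout=17min ok: %d\n",
	       handle_timeout_valid(17 * 60 * 1000));		/* rejected */
	return 0;
}
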
@@ -4198,7 +4272,7 @@ static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it, new_unc = kmalloc(len, GFP_KERNEL); if (!new_unc) return -ENOMEM; - snprintf(new_unc, len, "\\%s", tgt); + scnprintf(new_unc, len, "\\%s", tgt); kfree(vol->UNC); vol->UNC = new_unc; @@ -4902,8 +4976,6 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses) if (!server->ops->need_neg(server)) return 0; - set_credits(server, 1); - rc = server->ops->negotiate(xid, ses); if (rc == 0) { spin_lock(&GlobalMid_Lock); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 907e85d65bb4..f26a48dd2e39 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -621,20 +621,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, { int rc = -EPERM; unsigned int xid; - int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; - struct cifs_io_parms io_parms; char *full_path = NULL; - struct inode *newinode = NULL; - __u32 oplock = 0; - struct cifs_fid fid; - struct cifs_open_parms oparms; - FILE_ALL_INFO *buf = NULL; - unsigned int bytes_written; - struct win_dev *pdev; - struct kvec iov[2]; if (!old_valid_dev(device_number)) return -EINVAL; @@ -654,103 +644,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, goto mknod_out; } - if (tcon->unix_ext) { - struct cifs_unix_set_info_args args = { - .mode = mode & ~current_umask(), - .ctime = NO_CHANGE_64, - .atime = NO_CHANGE_64, - .mtime = NO_CHANGE_64, - .device = device_number, - }; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - args.uid = current_fsuid(); - args.gid = current_fsgid(); - } else { - args.uid = INVALID_UID; /* no change */ - args.gid = INVALID_GID; /* no change */ - } - rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); - if (rc) - goto mknod_out; - - rc = cifs_get_inode_info_unix(&newinode, full_path, - inode->i_sb, xid); - - if (rc == 0) - d_instantiate(direntry, newinode); - goto mknod_out; - } - - if (!S_ISCHR(mode) && !S_ISBLK(mode)) - goto mknod_out; - - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - goto mknod_out; - - - cifs_dbg(FYI, "sfu compat create special file\n"); - - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - rc = -ENOMEM; - goto mknod_out; - } - - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = create_options; - oparms.disposition = FILE_CREATE; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; - - if (tcon->ses->server->oplocks) - oplock = REQ_OPLOCK; - else - oplock = 0; - rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); - if (rc) - goto mknod_out; - - /* - * BB Do not bother to decode buf since no local inode yet to put - * timestamps in, but we can reuse it safely. 
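
The dir.c hunk that follows strips cifs_mknod() of all protocol knowledge: both the Unix-extensions path and the SFU special-file emulation move behind a new ->make_node() operation (the SMB1 implementation reappears verbatim in smb1ops.c later in this diff). A minimal model of the dispatch, with simplified stand-in names; the NULL check here is a stand-in for the kernel's default rc of -EPERM:

#include <errno.h>
#include <stdio.h>

struct smb_ops {
	int (*make_node)(const char *path, unsigned int mode);
};

static int smb1_make_node(const char *path, unsigned int mode)
{
	/* unix extensions or SFU emulation would live here */
	printf("smb1: mknod %s mode=%o\n", path, mode);
	return 0;
}

static const struct smb_ops smb1_operations = {
	.make_node = smb1_make_node,
};

/* The VFS-facing handler no longer branches on dialect; it just
 * calls through the per-protocol ops table. */
static int cifs_mknod(const struct smb_ops *ops, const char *path,
		      unsigned int mode)
{
	if (!ops->make_node)
		return -EPERM;	/* dialect has no special-file support */
	return ops->make_node(path, mode);
}

int main(void)
{
	return cifs_mknod(&smb1_operations, "/mnt/share/dev0", 0600);
}
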
- */ - - pdev = (struct win_dev *)buf; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = sizeof(struct win_dev); - iov[1].iov_base = buf; - iov[1].iov_len = sizeof(struct win_dev); - if (S_ISCHR(mode)) { - memcpy(pdev->type, "IntxCHR", 8); - pdev->major = cpu_to_le64(MAJOR(device_number)); - pdev->minor = cpu_to_le64(MINOR(device_number)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); - } else if (S_ISBLK(mode)) { - memcpy(pdev->type, "IntxBLK", 8); - pdev->major = cpu_to_le64(MAJOR(device_number)); - pdev->minor = cpu_to_le64(MINOR(device_number)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); - } - tcon->ses->server->ops->close(xid, tcon, &fid); - d_drop(direntry); - - /* FIXME: add code here to set EAs */ + rc = tcon->ses->server->ops->make_node(xid, inode, direntry, tcon, + full_path, mode, + device_number); mknod_out: kfree(full_path); - kfree(buf); free_xid(xid); cifs_put_tlink(tlink); return rc; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 659ce1b92c44..7037a137fa53 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -360,13 +360,31 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) return cifs_file; } -/* - * Release a reference on the file private data. This may involve closing - * the filehandle out on the server. Must be called without holding - * tcon->open_file_lock and cifs_file->file_info_lock. +/** + * cifsFileInfo_put - release a reference of file priv data + * + * Always potentially wait for oplock handler. See _cifsFileInfo_put(). */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { + _cifsFileInfo_put(cifs_file, true); +} + +/** + * _cifsFileInfo_put - release a reference of file priv data + * + * This may involve closing the filehandle @cifs_file out on the + * server. Must be called without holding tcon->open_file_lock and + * cifs_file->file_info_lock. + * + * If @wait_for_oplock_handler is true and we are releasing the last + * reference, wait for any running oplock break handler of the file + * and cancel any pending one. If calling this function from the + * oplock break handler, you need to pass false. + * + */ +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) +{ struct inode *inode = d_inode(cifs_file->dentry); struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); struct TCP_Server_Info *server = tcon->ses->server; @@ -414,7 +432,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) spin_unlock(&tcon->open_file_lock); - oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break); + oplock_break_cancelled = wait_oplock_handler ? 
+ cancel_work_sync(&cifs_file->oplock_break) : false; if (!tcon->need_reconnect && !cifs_file->invalidHandle) { struct TCP_Server_Info *server = tcon->ses->server; @@ -1645,8 +1664,20 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, rc = server->ops->mand_unlock_range(cfile, flock, xid); out: - if (flock->fl_flags & FL_POSIX && !rc) + if (flock->fl_flags & FL_POSIX) { + /* + * If this is a request to remove all locks because we + * are closing the file, it doesn't matter if the + * unlocking failed as both cifs.ko and the SMB server + * remove the lock on file close + */ + if (rc) { + cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc); + if (!(flock->fl_flags & FL_CLOSE)) + return rc; + } rc = locks_lock_file_wait(file, flock); + } return rc; } @@ -1842,24 +1873,30 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, return NULL; } -struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, - bool fsuid_only) +/* Return -EBADF if no handle is found and general rc otherwise */ +int +cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, + struct cifsFileInfo **ret_file) { struct cifsFileInfo *open_file, *inv_file = NULL; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; bool any_available = false; - int rc; + int rc = -EBADF; unsigned int refind = 0; - /* Having a null inode here (because mapping->host was set to zero by - the VFS or MM) should not happen but we had reports of on oops (due to - it being zero) during stress testcases so we need to check for it */ + *ret_file = NULL; + + /* + * Having a null inode here (because mapping->host was set to zero by + * the VFS or MM) should not happen but we had reports of on oops (due + * to it being zero) during stress testcases so we need to check for it + */ if (cifs_inode == NULL) { cifs_dbg(VFS, "Null inode passed to cifs_writeable_file\n"); dump_stack(); - return NULL; + return rc; } cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); @@ -1873,7 +1910,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, refind_writable: if (refind > MAX_REOPEN_ATT) { spin_unlock(&tcon->open_file_lock); - return NULL; + return rc; } list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (!any_available && open_file->pid != current->tgid) @@ -1885,7 +1922,8 @@ refind_writable: /* found a good writable file */ cifsFileInfo_get(open_file); spin_unlock(&tcon->open_file_lock); - return open_file; + *ret_file = open_file; + return 0; } else { if (!inv_file) inv_file = open_file; @@ -1907,22 +1945,35 @@ refind_writable: if (inv_file) { rc = cifs_reopen_file(inv_file, false); - if (!rc) - return inv_file; - else { - spin_lock(&tcon->open_file_lock); - list_move_tail(&inv_file->flist, - &cifs_inode->openFileList); - spin_unlock(&tcon->open_file_lock); - cifsFileInfo_put(inv_file); - ++refind; - inv_file = NULL; - spin_lock(&tcon->open_file_lock); - goto refind_writable; + if (!rc) { + *ret_file = inv_file; + return 0; } + + spin_lock(&tcon->open_file_lock); + list_move_tail(&inv_file->flist, &cifs_inode->openFileList); + spin_unlock(&tcon->open_file_lock); + cifsFileInfo_put(inv_file); + ++refind; + inv_file = NULL; + spin_lock(&tcon->open_file_lock); + goto refind_writable; } - return NULL; + return rc; +} + +struct cifsFileInfo * +find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) +{ + struct cifsFileInfo *cfile; + int rc; + + rc = cifs_get_writable_file(cifs_inode, fsuid_only, &cfile); + if (rc) + cifs_dbg(FYI, "couldn't find 
writable handle rc=%d", rc); + + return cfile; } static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) @@ -1959,8 +2010,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) if (mapping->host->i_size - offset < (loff_t)to) to = (unsigned)(mapping->host->i_size - offset); - open_file = find_writable_file(CIFS_I(mapping->host), false); - if (open_file) { + rc = cifs_get_writable_file(CIFS_I(mapping->host), false, &open_file); + if (!rc) { bytes_written = cifs_write(open_file, open_file->pid, write_data, to - from, &offset); cifsFileInfo_put(open_file); @@ -1970,9 +2021,12 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) rc = 0; else if (bytes_written < 0) rc = bytes_written; + else + rc = -EFAULT; } else { - cifs_dbg(FYI, "No writeable filehandles for inode\n"); - rc = -EIO; + cifs_dbg(FYI, "No writable handle for write page rc=%d\n", rc); + if (!is_retryable_error(rc)) + rc = -EIO; } kunmap(page); @@ -2079,9 +2133,9 @@ static int wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, struct address_space *mapping, struct writeback_control *wbc) { - int rc = 0; - struct TCP_Server_Info *server; - unsigned int i; + int rc; + struct TCP_Server_Info *server = + tlink_tcon(wdata->cfile->tlink)->ses->server; wdata->sync_mode = wbc->sync_mode; wdata->nr_pages = nr_pages; @@ -2091,21 +2145,16 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, page_offset(wdata->pages[nr_pages - 1]), (loff_t)PAGE_SIZE); wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz; + wdata->pid = wdata->cfile->pid; - if (wdata->cfile != NULL) - cifsFileInfo_put(wdata->cfile); - wdata->cfile = find_writable_file(CIFS_I(mapping->host), false); - if (!wdata->cfile) { - cifs_dbg(VFS, "No writable handles for inode\n"); - rc = -EBADF; - } else { - wdata->pid = wdata->cfile->pid; - server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata, cifs_writedata_release); - } + rc = adjust_credits(server, &wdata->credits, wdata->bytes); + if (rc) + return rc; - for (i = 0; i < nr_pages; ++i) - unlock_page(wdata->pages[i]); + if (wdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = server->ops->async_writev(wdata, cifs_writedata_release); return rc; } @@ -2113,11 +2162,13 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb); + struct inode *inode = mapping->host; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct TCP_Server_Info *server; bool done = false, scanned = false, range_whole = false; pgoff_t end, index; struct cifs_writedata *wdata; + struct cifsFileInfo *cfile = NULL; int rc = 0; int saved_rc = 0; unsigned int xid; @@ -2143,11 +2194,23 @@ static int cifs_writepages(struct address_space *mapping, server = cifs_sb_master_tcon(cifs_sb)->ses->server; retry: while (!done && index <= end) { - unsigned int i, nr_pages, found_pages, wsize, credits; + unsigned int i, nr_pages, found_pages, wsize; pgoff_t next = 0, tofind, saved_index = index; + struct cifs_credits credits_on_stack; + struct cifs_credits *credits = &credits_on_stack; + int get_file_rc = 0; + + if (cfile) + cifsFileInfo_put(cfile); + + rc = cifs_get_writable_file(CIFS_I(inode), false, &cfile); + + /* in case of an error store it to return later */ + if (rc) + get_file_rc = rc; rc = server->ops->wait_mtu_credits(server, 
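
The rework above converts the NULL-returning find_writable_file() into cifs_get_writable_file(), which reports *why* no handle was found through the return code and hands the file back via an out parameter; callers can then distinguish retryable errors from a hard -EBADF. A small user-space model of that pattern and the compatibility wrapper, with invented stand-in types:

#include <errno.h>
#include <stdio.h>

struct file_handle { int id; };

/* Out-parameter lookup: -EBADF if no handle exists at all; other
 * codes (e.g. -EAGAIN from a failed reopen) can be retried. */
static int get_writable_file(int have_handle, struct file_handle **ret)
{
	static struct file_handle fh = { .id = 42 };

	*ret = NULL;
	if (!have_handle)
		return -EBADF;
	*ret = &fh;
	return 0;
}

/* Legacy wrapper kept for callers that only care whether a handle
 * exists; it just logs the reason and returns the pointer. */
static struct file_handle *find_writable_file(int have_handle)
{
	struct file_handle *fh;
	int rc = get_writable_file(have_handle, &fh);

	if (rc)
		fprintf(stderr, "couldn't find writable handle rc=%d\n", rc);
	return fh;
}

int main(void)
{
	struct file_handle *fh = find_writable_file(1);

	printf("got handle id=%d\n", fh ? fh->id : -1);
	return 0;
}
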
cifs_sb->wsize, - &wsize, &credits); + &wsize, credits); if (rc != 0) { done = true; break; @@ -2180,13 +2243,26 @@ retry: continue; } - wdata->credits = credits; + wdata->credits = credits_on_stack; + wdata->cfile = cfile; + cfile = NULL; + + if (!wdata->cfile) { + cifs_dbg(VFS, "No writable handle in writepages rc=%d\n", + get_file_rc); + if (is_retryable_error(get_file_rc)) + rc = get_file_rc; + else + rc = -EBADF; + } else + rc = wdata_send_pages(wdata, nr_pages, mapping, wbc); - rc = wdata_send_pages(wdata, nr_pages, mapping, wbc); + for (i = 0; i < nr_pages; ++i) + unlock_page(wdata->pages[i]); /* send failure -- clean up the mess */ if (rc != 0) { - add_credits_and_wake_if(server, wdata->credits, 0); + add_credits_and_wake_if(server, &wdata->credits, 0); for (i = 0; i < nr_pages; ++i) { if (is_retryable_error(rc)) redirty_page_for_writepage(wbc, @@ -2238,6 +2314,8 @@ retry: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; + if (cfile) + cifsFileInfo_put(cfile); free_xid(xid); return rc; } @@ -2567,47 +2645,62 @@ static int cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, struct cifs_aio_ctx *ctx) { - unsigned int wsize, credits; + unsigned int wsize; + struct cifs_credits credits; int rc; struct TCP_Server_Info *server = tlink_tcon(wdata->cfile->tlink)->ses->server; - /* - * Wait for credits to resend this wdata. - * Note: we are attempting to resend the whole wdata not in segments - */ do { - rc = server->ops->wait_mtu_credits( - server, wdata->bytes, &wsize, &credits); + if (wdata->cfile->invalidHandle) { + rc = cifs_reopen_file(wdata->cfile, false); + if (rc == -EAGAIN) + continue; + else if (rc) + break; + } - if (rc) - goto out; - if (wsize < wdata->bytes) { - add_credits_and_wake_if(server, credits, 0); - msleep(1000); - } - } while (wsize < wdata->bytes); + /* + * Wait for credits to resend this wdata. 
+ * Note: we are attempting to resend the whole wdata not in + * segments + */ + do { + rc = server->ops->wait_mtu_credits(server, wdata->bytes, + &wsize, &credits); + if (rc) + goto fail; + + if (wsize < wdata->bytes) { + add_credits_and_wake_if(server, &credits, 0); + msleep(1000); + } + } while (wsize < wdata->bytes); + wdata->credits = credits; - rc = -EAGAIN; - while (rc == -EAGAIN) { - rc = 0; - if (wdata->cfile->invalidHandle) - rc = cifs_reopen_file(wdata->cfile, false); - if (!rc) - rc = server->ops->async_writev(wdata, + rc = adjust_credits(server, &wdata->credits, wdata->bytes); + + if (!rc) { + if (wdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = server->ops->async_writev(wdata, cifs_uncached_writedata_release); - } + } - if (!rc) { - list_add_tail(&wdata->list, wdata_list); - return 0; - } + /* If the write was successfully sent, we are done */ + if (!rc) { + list_add_tail(&wdata->list, wdata_list); + return 0; + } - add_credits_and_wake_if(server, wdata->credits, 0); -out: - kref_put(&wdata->refcount, cifs_uncached_writedata_release); + /* Roll back credits and retry if needed */ + add_credits_and_wake_if(server, &wdata->credits, 0); + } while (rc == -EAGAIN); +fail: + kref_put(&wdata->refcount, cifs_uncached_writedata_release); return rc; } @@ -2627,6 +2720,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, struct TCP_Server_Info *server; struct page **pagevec; size_t start; + unsigned int xid; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; @@ -2634,12 +2728,23 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, pid = current->tgid; server = tlink_tcon(open_file->tlink)->ses->server; + xid = get_xid(); do { - unsigned int wsize, credits; + unsigned int wsize; + struct cifs_credits credits_on_stack; + struct cifs_credits *credits = &credits_on_stack; + + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, false); + if (rc == -EAGAIN) + continue; + else if (rc) + break; + } rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, - &wsize, &credits); + &wsize, credits); if (rc) break; @@ -2731,16 +2836,22 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, wdata->pid = pid; wdata->bytes = cur_len; wdata->pagesz = PAGE_SIZE; - wdata->credits = credits; + wdata->credits = credits_on_stack; wdata->ctx = ctx; kref_get(&ctx->refcount); - if (!wdata->cfile->invalidHandle || - !(rc = cifs_reopen_file(wdata->cfile, false))) - rc = server->ops->async_writev(wdata, + rc = adjust_credits(server, &wdata->credits, wdata->bytes); + + if (!rc) { + if (wdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = server->ops->async_writev(wdata, cifs_uncached_writedata_release); + } + if (rc) { - add_credits_and_wake_if(server, wdata->credits, 0); + add_credits_and_wake_if(server, &wdata->credits, 0); kref_put(&wdata->refcount, cifs_uncached_writedata_release); if (rc == -EAGAIN) { @@ -2756,6 +2867,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, len -= cur_len; } while (len > 0); + free_xid(xid); return rc; } @@ -2765,7 +2877,6 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx) struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; struct dentry *dentry = ctx->cfile->dentry; - unsigned int i; int rc; tcon = tlink_tcon(ctx->cfile->tlink); @@ -2816,12 +2927,12 @@ restart_loop: wdata->bytes, &tmp_from, ctx->cfile, cifs_sb, &tmp_list, ctx); + + kref_put(&wdata->refcount, + cifs_uncached_writedata_release); } list_splice(&tmp_list, 
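
The writeback path above now reserves credits for a full wsize before it knows how many pages it will actually gather, then calls adjust_credits() to hand the surplus back once wdata->bytes is known. The sketch below models smb2_adjust_credits() as it appears later in this diff: recompute what the payload really needs, return the excess to the pool, fail loudly if the request would need more than was reserved, and treat an instance mismatch as a reconnect. Locking and wakeups are omitted; ENOTSUP stands in for the kernel's ENOTSUPP.

#include <errno.h>
#include <stdio.h>

#define SMB2_MAX_BUFFER_SIZE	65536
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

struct server {
	unsigned int credits;
	unsigned int reconnect_instance;
};

struct cifs_credits {
	unsigned int value;
	unsigned int instance;
};

static int adjust_credits(struct server *srv, struct cifs_credits *credits,
			  unsigned int payload_size)
{
	unsigned int new_val = DIV_ROUND_UP(payload_size, SMB2_MAX_BUFFER_SIZE);

	if (!credits->value || credits->value == new_val)
		return 0;
	if (credits->value < new_val)
		return -ENOTSUP;	/* reserved too little: a bug */
	if (srv->reconnect_instance != credits->instance)
		return -EAGAIN;		/* credits belong to a dead connection */

	srv->credits += credits->value - new_val;	/* return surplus */
	credits->value = new_val;
	return 0;
}

int main(void)
{
	struct server srv = { .credits = 0, .reconnect_instance = 1 };
	struct cifs_credits c = { .value = 16, .instance = 1 };	/* 1M reserved */

	adjust_credits(&srv, &c, 200000);	/* only ~200K to send */
	printf("kept=%u returned=%u\n", c.value, srv.credits);
	return 0;
}
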
&ctx->list); - - kref_put(&wdata->refcount, - cifs_uncached_writedata_release); goto restart_loop; } } @@ -2829,10 +2940,6 @@ restart_loop: kref_put(&wdata->refcount, cifs_uncached_writedata_release); } - if (!ctx->direct_io) - for (i = 0; i < ctx->npages; i++) - put_page(ctx->bv[i].bv_page); - cifs_stats_bytes_written(tcon, ctx->total_len); set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags); @@ -3028,14 +3135,16 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from) * these pages but not on the region from pos to ppos+len-1. */ written = cifs_user_writev(iocb, from); - if (written > 0 && CIFS_CACHE_READ(cinode)) { + if (CIFS_CACHE_READ(cinode)) { /* - * Windows 7 server can delay breaking level2 oplock if a write - * request comes - break it on the client to prevent reading - * an old data. + * We have read level caching and we have just sent a write + * request to the server thus making data in the cache stale. + * Zap the cache and set oplock/lease level to NONE to avoid + * reading stale data from the cache. All subsequent read + * operations will read new data from the server. */ cifs_zap_mapping(inode); - cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n", + cifs_dbg(FYI, "Set Oplock/Lease to NONE for inode=%p after write\n", inode); cinode->oplock = 0; } @@ -3260,48 +3369,61 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata, struct list_head *rdata_list, struct cifs_aio_ctx *ctx) { - unsigned int rsize, credits; + unsigned int rsize; + struct cifs_credits credits; int rc; struct TCP_Server_Info *server = tlink_tcon(rdata->cfile->tlink)->ses->server; - /* - * Wait for credits to resend this rdata. - * Note: we are attempting to resend the whole rdata not in segments - */ do { - rc = server->ops->wait_mtu_credits(server, rdata->bytes, + if (rdata->cfile->invalidHandle) { + rc = cifs_reopen_file(rdata->cfile, true); + if (rc == -EAGAIN) + continue; + else if (rc) + break; + } + + /* + * Wait for credits to resend this rdata. 
+ * Note: we are attempting to resend the whole rdata not in + * segments + */ + do { + rc = server->ops->wait_mtu_credits(server, rdata->bytes, &rsize, &credits); - if (rc) - goto out; + if (rc) + goto fail; - if (rsize < rdata->bytes) { - add_credits_and_wake_if(server, credits, 0); - msleep(1000); - } - } while (rsize < rdata->bytes); + if (rsize < rdata->bytes) { + add_credits_and_wake_if(server, &credits, 0); + msleep(1000); + } + } while (rsize < rdata->bytes); + rdata->credits = credits; - rc = -EAGAIN; - while (rc == -EAGAIN) { - rc = 0; - if (rdata->cfile->invalidHandle) - rc = cifs_reopen_file(rdata->cfile, true); - if (!rc) - rc = server->ops->async_readv(rdata); - } + rc = adjust_credits(server, &rdata->credits, rdata->bytes); + if (!rc) { + if (rdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = server->ops->async_readv(rdata); + } - if (!rc) { - /* Add to aio pending list */ - list_add_tail(&rdata->list, rdata_list); - return 0; - } + /* If the read was successfully sent, we are done */ + if (!rc) { + /* Add to aio pending list */ + list_add_tail(&rdata->list, rdata_list); + return 0; + } - add_credits_and_wake_if(server, rdata->credits, 0); -out: - kref_put(&rdata->refcount, - cifs_uncached_readdata_release); + /* Roll back credits and retry if needed */ + add_credits_and_wake_if(server, &rdata->credits, 0); + } while (rc == -EAGAIN); +fail: + kref_put(&rdata->refcount, cifs_uncached_readdata_release); return rc; } @@ -3311,7 +3433,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, struct cifs_aio_ctx *ctx) { struct cifs_readdata *rdata; - unsigned int npages, rsize, credits; + unsigned int npages, rsize; + struct cifs_credits credits_on_stack; + struct cifs_credits *credits = &credits_on_stack; size_t cur_len; int rc; pid_t pid; @@ -3331,8 +3455,16 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, iov_iter_advance(&direct_iov, offset - ctx->pos); do { + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, true); + if (rc == -EAGAIN) + continue; + else if (rc) + break; + } + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, - &rsize, &credits); + &rsize, credits); if (rc) break; @@ -3406,15 +3538,21 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata->pagesz = PAGE_SIZE; rdata->read_into_pages = cifs_uncached_read_into_pages; rdata->copy_into_pages = cifs_uncached_copy_into_pages; - rdata->credits = credits; + rdata->credits = credits_on_stack; rdata->ctx = ctx; kref_get(&ctx->refcount); - if (!rdata->cfile->invalidHandle || - !(rc = cifs_reopen_file(rdata->cfile, true))) - rc = server->ops->async_readv(rdata); + rc = adjust_credits(server, &rdata->credits, rdata->bytes); + + if (!rc) { + if (rdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = server->ops->async_readv(rdata); + } + if (rc) { - add_credits_and_wake_if(server, rdata->credits, 0); + add_credits_and_wake_if(server, &rdata->credits, 0); kref_put(&rdata->refcount, cifs_uncached_readdata_release); if (rc == -EAGAIN) { @@ -3439,7 +3577,6 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx) struct iov_iter *to = &ctx->iter; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; - unsigned int i; int rc; tcon = tlink_tcon(ctx->cfile->tlink); @@ -3523,17 +3660,8 @@ again: kref_put(&rdata->refcount, cifs_uncached_readdata_release); } - if (!ctx->direct_io) { - for (i = 0; i < ctx->npages; i++) { - if (ctx->should_dirty) - set_page_dirty(ctx->bv[i].bv_page); - 
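
cifs_resend_wdata() and cifs_resend_rdata() are restructured above into the same outer retry loop: re-validate the handle first, wait for credits sized to the whole payload, adjust, send, and on -EAGAIN roll the credits back and start over; any other error falls through to the caller, which drops its reference. A skeleton of that shared shape, with stand-in stub functions (each returns 0 on success, -EAGAIN for "retry the sequence"):

#include <errno.h>
#include <stdio.h>

static int reopen_file(void) { return 0; }
static int wait_for_credits(unsigned int bytes) { (void)bytes; return 0; }
static int adjust_credits(unsigned int bytes) { (void)bytes; return 0; }
static int send_async(unsigned int bytes) { (void)bytes; return 0; }
static void rollback_credits(void) { }

static int resend(int handle_invalid, unsigned int bytes)
{
	int rc;

	do {
		if (handle_invalid) {
			rc = reopen_file();
			if (rc == -EAGAIN)
				continue;
			else if (rc)
				break;
			handle_invalid = 0;	/* reopen succeeded */
		}

		rc = wait_for_credits(bytes);
		if (rc)
			break;

		rc = adjust_credits(bytes);
		if (!rc)
			rc = send_async(bytes);
		if (!rc)
			return 0;	/* sent; caller keeps the data alive */

		rollback_credits();	/* loop again only on -EAGAIN */
	} while (rc == -EAGAIN);

	return rc;
}

int main(void)
{
	printf("rc=%d\n", resend(0, 1 << 20));
	return 0;
}
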
put_page(ctx->bv[i].bv_page); - } - + if (!ctx->direct_io) ctx->total_len = ctx->len - iov_iter_count(to); - } - - cifs_stats_bytes_read(tcon, ctx->total_len); /* mask nodata case */ if (rc == -ENODATA) @@ -4095,10 +4223,19 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, loff_t offset; struct page *page, *tpage; struct cifs_readdata *rdata; - unsigned credits; + struct cifs_credits credits_on_stack; + struct cifs_credits *credits = &credits_on_stack; + + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, true); + if (rc == -EAGAIN) + continue; + else if (rc) + break; + } rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, - &rsize, &credits); + &rsize, credits); if (rc) break; @@ -4144,18 +4281,24 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rdata->tailsz = PAGE_SIZE; rdata->read_into_pages = cifs_readpages_read_into_pages; rdata->copy_into_pages = cifs_readpages_copy_into_pages; - rdata->credits = credits; + rdata->credits = credits_on_stack; list_for_each_entry_safe(page, tpage, &tmplist, lru) { list_del(&page->lru); rdata->pages[rdata->nr_pages++] = page; } - if (!rdata->cfile->invalidHandle || - !(rc = cifs_reopen_file(rdata->cfile, true))) - rc = server->ops->async_readv(rdata); + rc = adjust_credits(server, &rdata->credits, rdata->bytes); + + if (!rc) { + if (rdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = server->ops->async_readv(rdata); + } + if (rc) { - add_credits_and_wake_if(server, rdata->credits, 0); + add_credits_and_wake_if(server, &rdata->credits, 0); for (i = 0; i < rdata->nr_pages; i++) { page = rdata->pages[i]; lru_cache_add_file(page); @@ -4466,6 +4609,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + _cifsFileInfo_put(cfile, false /* do not wait for ourself */); cifs_done_oplock_break(cinode); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 478003644916..538fd7d807e4 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1735,6 +1735,10 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (rc == 0 || rc != -EBUSY) goto do_rename_exit; + /* Don't fall back to using SMB on SMB 2+ mount */ + if (server->vals->protocol_id != 0) + goto do_rename_exit; + /* open-file renames don't work across directories */ if (to_dentry->d_parent != from_dentry->d_parent) goto do_rename_exit; @@ -2080,7 +2084,7 @@ int cifs_getattr(const struct path *path, struct kstat *stat, return rc; generic_fillattr(inode, stat); - stat->blksize = CIFS_MAX_MSGSIZE; + stat->blksize = cifs_sb->bsize; stat->ino = CIFS_I(inode)->uniqueid; /* old CIFS Unix Extensions doesn't return create time */ diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 2148b0f60e5e..62216dc8f9f5 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -103,9 +103,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, return rc; } - snprintf(md5_str2, sizeof(md5_str2), - CIFS_MF_SYMLINK_MD5_FORMAT, - CIFS_MF_SYMLINK_MD5_ARGS(md5_hash)); + scnprintf(md5_str2, sizeof(md5_str2), + CIFS_MF_SYMLINK_MD5_FORMAT, + CIFS_MF_SYMLINK_MD5_ARGS(md5_hash)); if (strncmp(md5_str1, md5_str2, 17) != 0) return -EINVAL; @@ -142,10 +142,10 @@ format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str) return rc; } - snprintf(buf, buf_len, - CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, - link_len, - CIFS_MF_SYMLINK_MD5_ARGS(md5_hash)); + scnprintf(buf, buf_len, + CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, + 
link_len, + CIFS_MF_SYMLINK_MD5_ARGS(md5_hash)); ofs = CIFS_MF_SYMLINK_LINK_OFFSET; memcpy(buf + ofs, link_str, link_len); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index bee203055b30..b1a696a73f7c 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -501,8 +501,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &pCifsInode->flags); - queue_work(cifsoplockd_wq, - &netfile->oplock_break); + cifs_queue_oplock_break(netfile); netfile->oplock_break_cancelled = false; spin_unlock(&tcon->open_file_lock); @@ -607,6 +606,28 @@ void cifs_put_writer(struct cifsInodeInfo *cinode) spin_unlock(&cinode->writers_lock); } +/** + * cifs_queue_oplock_break - queue the oplock break handler for cfile + * + * This function is called from the demultiplex thread when it + * receives an oplock break for @cfile. + * + * Assumes the tcon->open_file_lock is held. + * Assumes cfile->file_info_lock is NOT held. + */ +void cifs_queue_oplock_break(struct cifsFileInfo *cfile) +{ + /* + * Bump the handle refcount now while we hold the + * open_file_lock to enforce the validity of it for the oplock + * break handler. The matching put is done at the end of the + * handler. + */ + cifsFileInfo_get(cfile); + + queue_work(cifsoplockd_wq, &cfile->oplock_break); +} + void cifs_done_oplock_break(struct cifsInodeInfo *cinode) { clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); @@ -768,6 +789,11 @@ cifs_aio_ctx_alloc(void) { struct cifs_aio_ctx *ctx; + /* + * Must use kzalloc to initialize ctx->bv to NULL and ctx->direct_io + * to false so that we know when we have to unreference pages within + * cifs_aio_ctx_release() + */ ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL); if (!ctx) return NULL; @@ -786,7 +812,23 @@ cifs_aio_ctx_release(struct kref *refcount) struct cifs_aio_ctx, refcount); cifsFileInfo_put(ctx->cfile); - kvfree(ctx->bv); + + /* + * ctx->bv is only set if setup_aio_ctx_iter() was call successfuly + * which means that iov_iter_get_pages() was a success and thus that + * we have taken reference on pages. + */ + if (ctx->bv) { + unsigned i; + + for (i = 0; i < ctx->npages; i++) { + if (ctx->should_dirty) + set_page_dirty(ctx->bv[i].bv_page); + put_page(ctx->bv[i].bv_page); + } + kvfree(ctx->bv); + } + kfree(ctx); } @@ -917,7 +959,6 @@ cifs_alloc_hash(const char *name, } (*sdesc)->shash.tfm = *shash; - (*sdesc)->shash.flags = 0x0; return 0; } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 32a6c020478f..c711f1f39bf2 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -117,11 +117,11 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer) } static void -cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add, - const int optype) +cifs_add_credits(struct TCP_Server_Info *server, + const struct cifs_credits *credits, const int optype) { spin_lock(&server->req_lock); - server->credits += add; + server->credits += credits->value; server->in_flight--; spin_unlock(&server->req_lock); wake_up(&server->request_q); @@ -308,7 +308,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) remaining = tgt_total_cnt - total_in_tgt; if (remaining < 0) { - cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n", + cifs_dbg(FYI, "Server sent too much data. 
tgt_total_cnt=%hu total_in_tgt=%u\n", tgt_total_cnt, total_in_tgt); return -EPROTO; } @@ -1027,6 +1027,131 @@ cifs_can_echo(struct TCP_Server_Info *server) return false; } +static int +cifs_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + char *full_path, umode_t mode, dev_t dev) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct inode *newinode = NULL; + int rc = -EPERM; + int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; + FILE_ALL_INFO *buf = NULL; + struct cifs_io_parms io_parms; + __u32 oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + unsigned int bytes_written; + struct win_dev *pdev; + struct kvec iov[2]; + + if (tcon->unix_ext) { + /* + * SMB1 Unix Extensions: requires server support but + * works with all special files + */ + struct cifs_unix_set_info_args args = { + .mode = mode & ~current_umask(), + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = dev, + }; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = current_fsuid(); + args.gid = current_fsgid(); + } else { + args.uid = INVALID_UID; /* no change */ + args.gid = INVALID_GID; /* no change */ + } + rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + if (rc) + goto out; + + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + + if (rc == 0) + d_instantiate(dentry, newinode); + goto out; + } + + /* + * SMB1 SFU emulation: should work with all servers, but only + * support block and char device (no socket & fifo) + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + goto out; + + if (!S_ISCHR(mode) && !S_ISBLK(mode)) + goto out; + + cifs_dbg(FYI, "sfu compat create special file\n"); + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto out; + } + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + if (tcon->ses->server->oplocks) + oplock = REQ_OPLOCK; + else + oplock = 0; + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); + if (rc) + goto out; + + /* + * BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely. 
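
The new cifs_queue_oplock_break() in misc.c above centralizes the queue_work() calls and pins the cifsFileInfo with a reference before queueing, so the handle cannot be freed while the oplock break handler is pending; the handler then drops that reference via _cifsFileInfo_put(cfile, false), since waiting on its own work item would deadlock. A user-space model of the refcount choreography, with invented stand-in types:

#include <stdio.h>

struct cfile {
	int refcount;
};

static void cfile_get(struct cfile *f) { f->refcount++; }

/* Model of _cifsFileInfo_put(cfile, wait): the handler passes
 * wait == 0 because cancel_work_sync() on the work item it is
 * currently running from would deadlock. */
static void cfile_put(struct cfile *f, int wait_for_handler)
{
	(void)wait_for_handler;
	if (--f->refcount == 0)
		printf("last reference dropped, closing handle\n");
}

/* Pin the handle before the work item is queued so it stays valid
 * until the handler runs. */
static void queue_oplock_break(struct cfile *f)
{
	cfile_get(f);
	/* queue_work(cifsoplockd_wq, &f->oplock_break) in the kernel */
}

static void oplock_break_handler(struct cfile *f)
{
	/* ... acknowledge the oplock break to the server ... */
	cfile_put(f, 0);	/* matching put; never wait on ourself */
}

int main(void)
{
	struct cfile f = { .refcount = 1 };

	queue_oplock_break(&f);
	oplock_break_handler(&f);
	cfile_put(&f, 1);	/* original reference */
	return 0;
}
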
+ */ + + pdev = (struct win_dev *)buf; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = sizeof(struct win_dev); + iov[1].iov_base = buf; + iov[1].iov_len = sizeof(struct win_dev); + if (S_ISCHR(mode)) { + memcpy(pdev->type, "IntxCHR", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } else if (S_ISBLK(mode)) { + memcpy(pdev->type, "IntxBLK", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } + tcon->ses->server->ops->close(xid, tcon, &fid); + d_drop(dentry); + + /* FIXME: add code here to set EAs */ +out: + kfree(buf); + return rc; +} + + + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -1110,6 +1235,7 @@ struct smb_version_operations smb1_operations = { .get_acl_by_fid = get_cifs_acl_by_fid, .set_acl = set_cifs_acl, #endif /* CIFS_ACL */ + .make_node = cifs_make_node, }; struct smb_version_values smb1_values = { diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index b204e84b87fb..54bffb2a1786 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -68,13 +68,15 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, if (oparms->tcon->use_resilient) { - nr_ioctl_req.Timeout = 0; /* use server default (120 seconds) */ + /* default timeout is 0, servers pick default (120 seconds) */ + nr_ioctl_req.Timeout = + cpu_to_le32(oparms->tcon->handle_timeout); nr_ioctl_req.Reserved = 0; rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid, fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, true /* is_fsctl */, (char *)&nr_ioctl_req, sizeof(nr_ioctl_req), - NULL, NULL /* no return info */); + CIFSMaxBufSize, NULL, NULL /* no return info */); if (rc == -EOPNOTSUPP) { cifs_dbg(VFS, "resiliency not supported by server, disabling\n"); diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 01a76bccdb8d..278405d26c47 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -37,6 +37,16 @@ #include "smb2pdu.h" #include "smb2proto.h" +static void +free_set_inf_compound(struct smb_rqst *rqst) +{ + if (rqst[1].rq_iov) + SMB2_set_info_free(&rqst[1]); + if (rqst[2].rq_iov) + SMB2_close_free(&rqst[2]); +} + + static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, @@ -112,14 +122,18 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, PATH_MAX * 2, 0, NULL); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid, + full_path); break; case SMB2_OP_DELETE: + trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_MKDIR: /* * Directories are created through parameters in the * SMB2_open() call. 
*/ + trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_RMDIR: memset(&si_iov, 0, sizeof(si_iov)); @@ -135,6 +149,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_SET_EOF: memset(&si_iov, 0, sizeof(si_iov)); @@ -150,6 +165,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_SET_INFO: memset(&si_iov, 0, sizeof(si_iov)); @@ -166,6 +182,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid, + full_path); break; case SMB2_OP_RENAME: memset(&si_iov, 0, sizeof(si_iov)); @@ -190,6 +208,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_HARDLINK: memset(&si_iov, 0, sizeof(si_iov)); @@ -214,6 +233,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path); break; default: cifs_dbg(VFS, "Invalid command\n"); @@ -252,21 +272,65 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_query_info_free(&rqst[1]); if (rqst[2].rq_iov) SMB2_close_free(&rqst[2]); + if (rc) + trace_smb3_query_info_compound_err(xid, ses->Suid, + tcon->tid, rc); + else + trace_smb3_query_info_compound_done(xid, ses->Suid, + tcon->tid); break; case SMB2_OP_DELETE: + if (rc) + trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_delete_done(xid, ses->Suid, tcon->tid); + if (rqst[1].rq_iov) + SMB2_close_free(&rqst[1]); + break; case SMB2_OP_MKDIR: + if (rc) + trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid); if (rqst[1].rq_iov) SMB2_close_free(&rqst[1]); break; case SMB2_OP_HARDLINK: + if (rc) + trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_RENAME: + if (rc) + trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_rename_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_RMDIR: + if (rc) + trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_SET_EOF: + if (rc) + trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_SET_INFO: - if (rqst[1].rq_iov) - SMB2_set_info_free(&rqst[1]); - if (rqst[2].rq_iov) - SMB2_close_free(&rqst[2]); + if (rc) + trace_smb3_set_info_compound_err(xid, ses->Suid, + tcon->tid, rc); + else + trace_smb3_set_info_compound_done(xid, ses->Suid, + 
tcon->tid); + free_set_inf_compound(rqst); break; } free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); @@ -309,12 +373,17 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, rc = open_shroot(xid, tcon, &fid); if (rc) goto out; - rc = SMB2_query_info(xid, tcon, fid.persistent_fid, - fid.volatile_fid, smb2_data); + + if (tcon->crfid.file_all_info_is_valid) { + move_smb2_info_to_cifs(data, + &tcon->crfid.file_all_info); + } else { + rc = SMB2_query_info(xid, tcon, fid.persistent_fid, + fid.volatile_fid, smb2_data); + if (!rc) + move_smb2_info_to_cifs(data, smb2_data); + } close_shroot(&tcon->crfid); - if (rc) - goto out; - move_smb2_info_to_cifs(data, smb2_data); goto out; } diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 924269cec135..e32c264e3adb 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -1036,7 +1036,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_UNFINISHED_CONTEXT_DELETED, -EIO, "STATUS_UNFINISHED_CONTEXT_DELETED"}, {STATUS_NO_TGT_REPLY, -EIO, "STATUS_NO_TGT_REPLY"}, - {STATUS_OBJECTID_NOT_FOUND, -EIO, "STATUS_OBJECTID_NOT_FOUND"}, + /* Note that ENOATTTR and ENODATA are the same errno */ + {STATUS_OBJECTID_NOT_FOUND, -ENODATA, "STATUS_OBJECTID_NOT_FOUND"}, {STATUS_NO_IP_ADDRESSES, -EIO, "STATUS_NO_IP_ADDRESSES"}, {STATUS_WRONG_CREDENTIAL_HANDLE, -EIO, "STATUS_WRONG_CREDENTIAL_HANDLE"}, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 7b8b58fb4d3f..e311f58dc1c8 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -517,7 +517,6 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, __u8 lease_state; struct list_head *tmp; struct cifsFileInfo *cfile; - struct TCP_Server_Info *server = tcon->ses->server; struct cifs_pending_open *open; struct cifsInodeInfo *cinode; int ack_req = le32_to_cpu(rsp->Flags & @@ -537,14 +536,26 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, cifs_dbg(FYI, "lease key match, lease break 0x%x\n", le32_to_cpu(rsp->NewLeaseState)); - server->ops->set_oplock_level(cinode, lease_state, 0, NULL); - if (ack_req) cfile->oplock_break_cancelled = false; else cfile->oplock_break_cancelled = true; - queue_work(cifsoplockd_wq, &cfile->oplock_break); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); + + /* + * Set or clear flags depending on the lease state being READ. + * HANDLE caching flag should be added when the client starts + * to defer closing remote file handles with HANDLE leases. 
+ */ + if (lease_state & SMB2_LEASE_READ_CACHING_HE) + set_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + else + clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + + cifs_queue_oplock_break(cfile); kfree(lw); return true; } @@ -648,13 +659,6 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK) return false; - if (rsp->sync_hdr.CreditRequest) { - spin_lock(&server->req_lock); - server->credits += le16_to_cpu(rsp->sync_hdr.CreditRequest); - spin_unlock(&server->req_lock); - wake_up(&server->request_q); - } - if (rsp->StructureSize != smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { if (le16_to_cpu(rsp->StructureSize) == 44) @@ -708,8 +712,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); spin_unlock(&cfile->file_info_lock); - queue_work(cifsoplockd_wq, - &cfile->oplock_break); + + cifs_queue_oplock_break(cfile); spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 6f96e2292856..c36ff0d1fe2a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -67,10 +67,13 @@ change_conf(struct TCP_Server_Info *server) } static void -smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, - const int optype) +smb2_add_credits(struct TCP_Server_Info *server, + const struct cifs_credits *credits, const int optype) { int *val, rc = -1; + unsigned int add = credits->value; + unsigned int instance = credits->instance; + bool reconnect_detected = false; spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); @@ -79,8 +82,11 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0)) trace_smb3_reconnect_with_invalid_credits(server->CurrentMid, server->hostname, *val); + if ((instance == 0) || (instance == server->reconnect_instance)) + *val += add; + else + reconnect_detected = true; - *val += add; if (*val > 65000) { *val = 65000; /* Don't get near 64K credits, avoid srv bugs */ printk_once(KERN_WARNING "server overflowed SMB3 credits\n"); @@ -102,7 +108,12 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, spin_unlock(&server->req_lock); wake_up(&server->request_q); - if (server->tcpStatus == CifsNeedReconnect) + if (reconnect_detected) + cifs_dbg(FYI, "trying to put %d credits from the old server instance %d\n", + add, instance); + + if (server->tcpStatus == CifsNeedReconnect + || server->tcpStatus == CifsExiting) return; switch (rc) { @@ -163,7 +174,7 @@ smb2_get_credits(struct mid_q_entry *mid) static int smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, - unsigned int *num, unsigned int *credits) + unsigned int *num, struct cifs_credits *credits) { int rc = 0; unsigned int scredits; @@ -174,7 +185,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, spin_unlock(&server->req_lock); cifs_num_waiters_inc(server); rc = wait_event_killable(server->request_q, - has_credits(server, &server->credits)); + has_credits(server, &server->credits, 1)); cifs_num_waiters_dec(server); if (rc) return rc; @@ -189,7 +200,8 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, /* can deadlock with reopen */ if (scredits <= 8) { *num = SMB2_MAX_BUFFER_SIZE; - *credits = 0; + credits->value = 0; + credits->instance = 0; break; } @@ -198,8 +210,10 @@ 
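
The lease-break hunk above stops applying the new lease state directly from the demultiplex thread (and stops bumping credits inline, which smb2_add_credits_from_hdr now handles); instead it records the target state in inode flag bits and defers the actual downgrade to the queued worker. A sketch of that flag bookkeeping; the short macro names stand in for the kernel's CIFS_INODE_* and SMB2_LEASE_READ_CACHING_HE constants:

#include <stdio.h>

#define LEASE_READ_CACHING		0x01	/* stand-in bit */
#define INODE_PENDING_OPLOCK_BREAK	(1u << 0)
#define INODE_DOWNGRADE_OPLOCK_TO_L2	(1u << 1)

/* Record the pending break and whether the lease keeps READ caching
 * (downgrade to level II) or loses it entirely; the queued worker
 * applies the result later. */
static unsigned int lease_break_to_flags(unsigned int flags,
					 unsigned int lease_state)
{
	flags |= INODE_PENDING_OPLOCK_BREAK;
	if (lease_state & LEASE_READ_CACHING)
		flags |= INODE_DOWNGRADE_OPLOCK_TO_L2;
	else
		flags &= ~INODE_DOWNGRADE_OPLOCK_TO_L2;
	return flags;
}

int main(void)
{
	unsigned int flags = lease_break_to_flags(0, LEASE_READ_CACHING);

	printf("flags=%#x\n", flags);
	return 0;
}
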
smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, *num = min_t(unsigned int, size, scredits * SMB2_MAX_BUFFER_SIZE); - *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE); - server->credits -= *credits; + credits->value = + DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE); + credits->instance = server->reconnect_instance; + server->credits -= credits->value; server->in_flight++; break; } @@ -208,6 +222,38 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, return rc; } +static int +smb2_adjust_credits(struct TCP_Server_Info *server, + struct cifs_credits *credits, + const unsigned int payload_size) +{ + int new_val = DIV_ROUND_UP(payload_size, SMB2_MAX_BUFFER_SIZE); + + if (!credits->value || credits->value == new_val) + return 0; + + if (credits->value < new_val) { + WARN_ONCE(1, "request has less credits (%d) than required (%d)", + credits->value, new_val); + return -ENOTSUPP; + } + + spin_lock(&server->req_lock); + + if (server->reconnect_instance != credits->instance) { + spin_unlock(&server->req_lock); + cifs_dbg(VFS, "trying to return %d credits to old session\n", + credits->value - new_val); + return -EAGAIN; + } + + server->credits += credits->value - new_val; + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + credits->value = new_val; + return 0; +} + static __u64 smb2_get_next_mid(struct TCP_Server_Info *server) { @@ -219,6 +265,15 @@ smb2_get_next_mid(struct TCP_Server_Info *server) return mid; } +static void +smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val) +{ + spin_lock(&GlobalMid_Lock); + if (server->CurrentMid >= val) + server->CurrentMid -= val; + spin_unlock(&GlobalMid_Lock); +} + static struct mid_q_entry * smb2_find_mid(struct TCP_Server_Info *server, char *buf) { @@ -526,7 +581,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, NULL /* no data input */, 0 /* no data input */, - (char **)&out_buf, &ret_data_len); + CIFSMaxBufSize, (char **)&out_buf, &ret_data_len); if (rc == -EOPNOTSUPP) { cifs_dbg(FYI, "server does not support query network interfaces\n"); @@ -564,6 +619,7 @@ smb2_close_cached_fid(struct kref *ref) SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid, cfid->fid->volatile_fid); cfid->is_valid = false; + cfid->file_all_info_is_valid = false; } } @@ -588,9 +644,18 @@ smb2_cached_lease_break(struct work_struct *work) */ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) { - struct cifs_open_parms oparams; - int rc; - __le16 srch_path = 0; /* Null - since an open of top of share */ + struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = ses->server; + struct cifs_open_parms oparms; + struct smb2_create_rsp *o_rsp = NULL; + struct smb2_query_info_rsp *qi_rsp = NULL; + int resp_buftype[2]; + struct smb_rqst rqst[2]; + struct kvec rsp_iov[2]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct kvec qi_iov[1]; + int rc, flags = 0; + __le16 utf16_path = 0; /* Null - since an open of top of share */ u8 oplock = SMB2_OPLOCK_LEVEL_II; mutex_lock(&tcon->crfid.fid_mutex); @@ -602,22 +667,85 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) return 0; } - oparams.tcon = tcon; - oparams.create_options = 0; - oparams.desired_access = FILE_READ_ATTRIBUTES; - oparams.disposition = FILE_OPEN; - oparams.fid = pfid; - oparams.reconnect = false; + if 
(smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + /* Open */ + memset(&open_iov, 0, sizeof(open_iov)); + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; + + oparms.tcon = tcon; + oparms.create_options = 0; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.fid = pfid; + oparms.reconnect = false; + + rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &utf16_path); + if (rc) + goto oshr_exit; + smb2_set_next_command(tcon, &rqst[0]); + + memset(&qi_iov, 0, sizeof(qi_iov)); + rqst[1].rq_iov = qi_iov; + rqst[1].rq_nvec = 1; + + rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, + COMPOUND_FID, FILE_ALL_INFORMATION, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb2_file_all_info) + + PATH_MAX * 2, 0, NULL); + if (rc) + goto oshr_exit; - rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL, NULL); - if (rc == 0) { - memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); - tcon->crfid.tcon = tcon; - tcon->crfid.is_valid = true; - kref_init(&tcon->crfid.refcount); + smb2_set_related(&rqst[1]); + + rc = compound_send_recv(xid, ses, flags, 2, rqst, + resp_buftype, rsp_iov); + if (rc) + goto oshr_exit; + + o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; + oparms.fid->persistent_fid = o_rsp->PersistentFileId; + oparms.fid->volatile_fid = o_rsp->VolatileFileId; +#ifdef CONFIG_CIFS_DEBUG2 + oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId); +#endif /* CIFS_DEBUG2 */ + + memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); + tcon->crfid.tcon = tcon; + tcon->crfid.is_valid = true; + kref_init(&tcon->crfid.refcount); + + if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { kref_get(&tcon->crfid.refcount); - } + oplock = smb2_parse_lease_state(server, o_rsp, + &oparms.fid->epoch, + oparms.fid->lease_key); + } else + goto oshr_exit; + + qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; + if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) + goto oshr_exit; + if (!smb2_validate_and_copy_iov( + le16_to_cpu(qi_rsp->OutputBufferOffset), + sizeof(struct smb2_file_all_info), + &rsp_iov[1], sizeof(struct smb2_file_all_info), + (char *)&tcon->crfid.file_all_info)) + tcon->crfid.file_all_info_is_valid = 1; + + oshr_exit: mutex_unlock(&tcon->crfid.fid_mutex); + SMB2_open_free(&rqst[0]); + SMB2_query_info_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); return rc; } @@ -940,6 +1068,16 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; memset(rsp_iov, 0, sizeof(rsp_iov)); + if (ses->server->ops->query_all_EAs) { + if (!ea_value) { + rc = ses->server->ops->query_all_EAs(xid, tcon, path, + ea_name, NULL, 0, + cifs_sb); + if (rc == -ENODATA) + goto sea_exit; + } + } + /* Open */ memset(&open_iov, 0, sizeof(open_iov)); rqst[0].rq_iov = open_iov; @@ -1157,7 +1295,7 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */, - NULL, 0 /* no input */, + NULL, 0 /* no input */, CIFSMaxBufSize, (char **)&res_key, &ret_data_len); if (rc) { @@ -1188,7 +1326,8 @@ smb2_ioctl_query_info(const unsigned int xid, struct smb_query_info __user *pqi; int rc = 0; int flags = 0; - struct 
smb2_query_info_rsp *rsp = NULL; + struct smb2_query_info_rsp *qi_rsp = NULL; + struct smb2_ioctl_rsp *io_rsp = NULL; void *buffer = NULL; struct smb_rqst rqst[3]; int resp_buftype[3]; @@ -1198,6 +1337,7 @@ smb2_ioctl_query_info(const unsigned int xid, u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_fid fid; struct kvec qi_iov[1]; + struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; struct kvec close_iov[1]; memset(rqst, 0, sizeof(rqst)); @@ -1248,15 +1388,35 @@ smb2_ioctl_query_info(const unsigned int xid, smb2_set_next_command(tcon, &rqst[0]); /* Query */ - memset(&qi_iov, 0, sizeof(qi_iov)); - rqst[1].rq_iov = qi_iov; - rqst[1].rq_nvec = 1; - - rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID, - qi.file_info_class, qi.info_type, - qi.additional_information, + if (qi.flags & PASSTHRU_FSCTL) { + /* Can eventually relax perm check since server enforces too */ + if (!capable(CAP_SYS_ADMIN)) + rc = -EPERM; + else { + memset(&io_iov, 0, sizeof(io_iov)); + rqst[1].rq_iov = io_iov; + rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; + + rc = SMB2_ioctl_init(tcon, &rqst[1], + COMPOUND_FID, COMPOUND_FID, + qi.info_type, true, NULL, + 0, CIFSMaxBufSize); + } + } else if (qi.flags == PASSTHRU_QUERY_INFO) { + memset(&qi_iov, 0, sizeof(qi_iov)); + rqst[1].rq_iov = qi_iov; + rqst[1].rq_nvec = 1; + + rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, + COMPOUND_FID, qi.file_info_class, + qi.info_type, qi.additional_information, qi.input_buffer_length, qi.output_buffer_length, buffer); + } else { /* unknown flags */ + cifs_dbg(VFS, "invalid passthru query flags: 0x%x\n", qi.flags); + rc = -EINVAL; + } + if (rc) goto iqinf_exit; smb2_set_next_command(tcon, &rqst[1]); @@ -1276,24 +1436,44 @@ smb2_ioctl_query_info(const unsigned int xid, resp_buftype, rsp_iov); if (rc) goto iqinf_exit; - pqi = (struct smb_query_info __user *)arg; - rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; - if (le32_to_cpu(rsp->OutputBufferLength) < qi.input_buffer_length) - qi.input_buffer_length = le32_to_cpu(rsp->OutputBufferLength); - if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, - sizeof(qi.input_buffer_length))) { - rc = -EFAULT; - goto iqinf_exit; - } - if (copy_to_user(pqi + 1, rsp->Buffer, qi.input_buffer_length)) { - rc = -EFAULT; - goto iqinf_exit; + if (qi.flags & PASSTHRU_FSCTL) { + pqi = (struct smb_query_info __user *)arg; + io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base; + if (le32_to_cpu(io_rsp->OutputCount) < qi.input_buffer_length) + qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount); + if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto iqinf_exit; + } + if (copy_to_user(pqi + 1, &io_rsp[1], qi.input_buffer_length)) { + rc = -EFAULT; + goto iqinf_exit; + } + } else { + pqi = (struct smb_query_info __user *)arg; + qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; + if (le32_to_cpu(qi_rsp->OutputBufferLength) < qi.input_buffer_length) + qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength); + if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto iqinf_exit; + } + if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) { + rc = -EFAULT; + goto iqinf_exit; + } } iqinf_exit: kfree(buffer); SMB2_open_free(&rqst[0]); - SMB2_query_info_free(&rqst[1]); + if (qi.flags & PASSTHRU_FSCTL) + SMB2_ioctl_free(&rqst[1]); + else + SMB2_query_info_free(&rqst[1]); + SMB2_close_free(&rqst[2]); 
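+ /* rqst[0..2] carried the compounded open, query (or ioctl) and close requests; the *_free() helpers above release the request iovs, while the free_rsp_buf() calls that follow release the matching response buffers from the server. */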
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); @@ -1348,8 +1528,8 @@ smb2_copychunk_range(const unsigned int xid, rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, true /* is_fsctl */, (char *)pcchunk, - sizeof(struct copychunk_ioctl), (char **)&retbuf, - &ret_data_len); + sizeof(struct copychunk_ioctl), CIFSMaxBufSize, + (char **)&retbuf, &ret_data_len); if (rc == 0) { if (ret_data_len != sizeof(struct copychunk_ioctl_rsp)) { @@ -1509,7 +1689,7 @@ static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_SPARSE, true /* is_fctl */, - &setsparse, 1, NULL, NULL); + &setsparse, 1, CIFSMaxBufSize, NULL, NULL); if (rc) { tcon->broken_sparse_sup = true; cifs_dbg(FYI, "set sparse rc = %d\n", rc); @@ -1582,7 +1762,7 @@ smb2_duplicate_extents(const unsigned int xid, true /* is_fsctl */, (char *)&dup_ext_buf, sizeof(struct duplicate_extents_to_file), - NULL, + CIFSMaxBufSize, NULL, &ret_data_len); if (ret_data_len > 0) @@ -1617,7 +1797,7 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, true /* is_fsctl */, (char *)&integr_info, sizeof(struct fsctl_set_integrity_information_req), - NULL, + CIFSMaxBufSize, NULL, &ret_data_len); } @@ -1625,6 +1805,8 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, /* GMT Token is @GMT-YYYY.MM.DD-HH.MM.SS Unicode which is 48 bytes + null */ #define GMT_TOKEN_SIZE 50 +#define MIN_SNAPSHOT_ARRAY_SIZE 16 /* See MS-SMB2 section 3.3.5.15.1 */ + /* * Input buffer contains (empty) struct smb_snapshot array with size filled in * For output see struct SRV_SNAPSHOT_ARRAY in MS-SMB2 section 2.2.32.2 @@ -1636,13 +1818,29 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, char *retbuf = NULL; unsigned int ret_data_len = 0; int rc; + u32 max_response_size; struct smb_snapshot_array snapshot_in; + if (get_user(ret_data_len, (unsigned int __user *)ioc_buf)) + return -EFAULT; + + /* + * Note that for snapshot queries, servers like Azure expect that + * the first query be minimal size (and just used to get the number/size + * of previous versions) so response size must be specified as EXACTLY + * sizeof(struct snapshot_array) which is 16 when rounded up to multiple + * of eight bytes. + */ + if (ret_data_len == 0) + max_response_size = MIN_SNAPSHOT_ARRAY_SIZE; + else + max_response_size = CIFSMaxBufSize; + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SRV_ENUMERATE_SNAPSHOTS, true /* is_fsctl */, - NULL, 0 /* no input data */, + NULL, 0 /* no input data */, max_response_size, (char **)&retbuf, &ret_data_len); cifs_dbg(FYI, "enum snaphots ioctl returned %d and ret buflen is %d\n", @@ -1753,14 +1951,14 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon, * the number of credits and return true. Otherwise - return false.
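 * (An interim STATUS_PENDING response still grants credits through its CreditRequest field even though the final response is outstanding, so those credits are banked on the server struct here rather than discarded.)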
*/ static bool -smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length) +smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) { struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; if (shdr->Status != STATUS_PENDING) return false; - if (!length) { + if (shdr->CreditRequest) { spin_lock(&server->req_lock); server->credits += le16_to_cpu(shdr->CreditRequest); spin_unlock(&server->req_lock); @@ -2120,7 +2318,7 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_DFS_GET_REFERRALS, true /* is_fsctl */, - (char *)dfs_req, dfs_req_size, + (char *)dfs_req, dfs_req_size, CIFSMaxBufSize, (char **)&dfs_rsp, &dfs_rsp_size); } while (rc == -EAGAIN); @@ -2191,6 +2389,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov, &resp_buftype); + if (!rc) + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); if (!rc || !err_iov.iov_base) { rc = -ENOENT; goto free_path; @@ -2407,22 +2607,38 @@ get_smb2_acl(struct cifs_sb_info *cifs_sb, static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, loff_t offset, loff_t len, bool keep_size) { + struct cifs_ses *ses = tcon->ses; struct inode *inode; struct cifsInodeInfo *cifsi; struct cifsFileInfo *cfile = file->private_data; struct file_zero_data_information fsctl_buf; + struct smb_rqst rqst[2]; + int resp_buftype[2]; + struct kvec rsp_iov[2]; + struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; + struct kvec si_iov[1]; + unsigned int size[1]; + void *data[1]; long rc; unsigned int xid; + int num = 0, flags = 0; + __le64 eof; xid = get_xid(); inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); + trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len); + + /* if file not oplocked can't be sure whether asking to extend size */ if (!CIFS_CACHE_READ(cifsi)) if (keep_size == false) { rc = -EOPNOTSUPP; + trace_smb3_zero_err(xid, cfile->fid.persistent_fid, + tcon->tid, ses->Suid, offset, len, rc); free_xid(xid); return rc; } @@ -2433,33 +2649,74 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, */ if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) { rc = -EOPNOTSUPP; + trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len, rc); free_xid(xid); return rc; } - /* - * need to make sure we are not asked to extend the file since the SMB3 - * fsctl does not change the file size. 
In the future we could change - * this to zero the first part of the range then set the file size - * which for a non sparse file would zero the newly extended range - */ - if (keep_size == false) - if (i_size_read(inode) < offset + len) { - rc = -EOPNOTSUPP; - free_xid(xid); - return rc; - } - cifs_dbg(FYI, "offset %lld len %lld", offset, len); fsctl_buf.FileOffset = cpu_to_le64(offset); fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); - rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, - true /* is_fctl */, (char *)&fsctl_buf, - sizeof(struct file_zero_data_information), NULL, NULL); + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + + memset(&io_iov, 0, sizeof(io_iov)); + rqst[num].rq_iov = io_iov; + rqst[num].rq_nvec = SMB2_IOCTL_IOV_SIZE; + rc = SMB2_ioctl_init(tcon, &rqst[num++], cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, + true /* is_fctl */, (char *)&fsctl_buf, + sizeof(struct file_zero_data_information), + CIFSMaxBufSize); + if (rc) + goto zero_range_exit; + + /* + * do we also need to change the size of the file? + */ + if (keep_size == false && i_size_read(inode) < offset + len) { + smb2_set_next_command(tcon, &rqst[0]); + + memset(&si_iov, 0, sizeof(si_iov)); + rqst[num].rq_iov = si_iov; + rqst[num].rq_nvec = 1; + + eof = cpu_to_le64(offset + len); + size[0] = 8; /* sizeof __le64 */ + data[0] = &eof; + + rc = SMB2_set_info_init(tcon, &rqst[num++], + cfile->fid.persistent_fid, + cfile->fid.persistent_fid, + current->tgid, + FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_related(&rqst[1]); + } + + rc = compound_send_recv(xid, ses, flags, num, rqst, + resp_buftype, rsp_iov); + + zero_range_exit: + SMB2_ioctl_free(&rqst[0]); + SMB2_set_info_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_xid(xid); + if (rc) + trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len, rc); + else + trace_smb3_zero_done(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len); return rc; } @@ -2495,7 +2752,8 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, true /* is_fctl */, (char *)&fsctl_buf, - sizeof(struct file_zero_data_information), NULL, NULL); + sizeof(struct file_zero_data_information), + CIFSMaxBufSize, NULL, NULL); free_xid(xid); return rc; } @@ -2508,15 +2766,20 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, struct cifsFileInfo *cfile = file->private_data; long rc = -EOPNOTSUPP; unsigned int xid; + __le64 eof; xid = get_xid(); inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); + trace_smb3_falloc_enter(xid, cfile->fid.persistent_fid, tcon->tid, + tcon->ses->Suid, off, len); /* if file not oplocked can't be sure whether asking to extend size */ if (!CIFS_CACHE_READ(cifsi)) if (keep_size == false) { + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len, rc); free_xid(xid); return rc; } @@ -2536,6 +2799,12 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, /* BB: in future add else clause to extend file */ else rc = -EOPNOTSUPP; + if (rc) + trace_smb3_falloc_err(xid, 
cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len, rc); + else + trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len); free_xid(xid); return rc; } @@ -2551,14 +2820,31 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, */ if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) { rc = -EOPNOTSUPP; + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len, rc); free_xid(xid); return rc; } - rc = smb2_set_sparse(xid, tcon, cfile, inode, false); + smb2_set_sparse(xid, tcon, cfile, inode, false); + rc = 0; + } else { + smb2_set_sparse(xid, tcon, cfile, inode, false); + rc = 0; + if (i_size_read(inode) < off + len) { + eof = cpu_to_le64(off + len); + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, + &eof); + } } - /* BB: else ... in future add code to extend file and set sparse */ + if (rc) + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, tcon->tid, + tcon->ses->Suid, off, len, rc); + else + trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, tcon->tid, + tcon->ses->Suid, off, len); free_xid(xid); return rc; @@ -2595,6 +2881,15 @@ smb2_downgrade_oplock(struct TCP_Server_Info *server, } static void +smb21_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + server->ops->set_oplock_level(cinode, + set_level2 ? SMB2_LEASE_READ_CACHING_HE : + 0, 0, NULL); +} + +static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { @@ -3210,15 +3505,15 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, } if (server->ops->is_status_pending && - server->ops->is_status_pending(buf, server, 0)) + server->ops->is_status_pending(buf, server)) return -1; /* set up first two iov to get credits */ rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = 4; - rdata->iov[1].iov_base = buf + 4; + rdata->iov[0].iov_len = 0; + rdata->iov[1].iov_base = buf; rdata->iov[1].iov_len = - min_t(unsigned int, buf_len, server->vals->read_rsp_size) - 4; + min_t(unsigned int, buf_len, server->vals->read_rsp_size); cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", rdata->iov[0].iov_base, rdata->iov[0].iov_len); cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", @@ -3530,6 +3825,104 @@ smb2_next_header(char *buf) return le32_to_cpu(hdr->NextCommand); } +static int +smb2_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + char *full_path, umode_t mode, dev_t dev) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + int rc = -EPERM; + int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; + FILE_ALL_INFO *buf = NULL; + struct cifs_io_parms io_parms; + __u32 oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + unsigned int bytes_written; + struct win_dev *pdev; + struct kvec iov[2]; + + /* + * Check if mounted with the 'sfu' mount parm. + * SFU emulation should work with all servers, but only + * supports block and char device (no socket & fifo), + * and was used by default in earlier versions of Windows + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + goto out; + + /* + * TODO: Add ability to create instead via reparse point. Windows (e.g.
+ * their current NFS server) uses this approach to expose special files + * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions + */ + + if (!S_ISCHR(mode) && !S_ISBLK(mode)) + goto out; + + cifs_dbg(FYI, "sfu compat create special file\n"); + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto out; + } + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + if (tcon->ses->server->oplocks) + oplock = REQ_OPLOCK; + else + oplock = 0; + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); + if (rc) + goto out; + + /* + * BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely. + */ + + pdev = (struct win_dev *)buf; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = sizeof(struct win_dev); + iov[1].iov_base = buf; + iov[1].iov_len = sizeof(struct win_dev); + if (S_ISCHR(mode)) { + memcpy(pdev->type, "IntxCHR", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } else if (S_ISBLK(mode)) { + memcpy(pdev->type, "IntxBLK", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } + tcon->ses->server->ops->close(xid, tcon, &fid); + d_drop(dentry); + + /* FIXME: add code here to set EAs */ +out: + kfree(buf); + return rc; +} + + struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@ -3541,6 +3934,7 @@ struct smb_version_operations smb20_operations = { .get_credits = smb2_get_credits, .wait_mtu_credits = cifs_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3623,6 +4017,7 @@ struct smb_version_operations smb20_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_operations smb21_operations = { @@ -3635,7 +4030,9 @@ struct smb_version_operations smb21_operations = { .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, .wait_mtu_credits = smb2_wait_mtu_credits, + .adjust_credits = smb2_adjust_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3646,7 +4043,7 @@ struct smb_version_operations smb21_operations = { .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb2_downgrade_oplock, + .downgrade_oplock = smb21_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -3719,6 +4116,7 @@ struct smb_version_operations smb21_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = 
smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_operations smb30_operations = { @@ -3731,7 +4129,9 @@ struct smb_version_operations smb30_operations = { .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, .wait_mtu_credits = smb2_wait_mtu_credits, + .adjust_credits = smb2_adjust_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3743,7 +4143,7 @@ struct smb_version_operations smb30_operations = { .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb2_downgrade_oplock, + .downgrade_oplock = smb21_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb3_negotiate_wsize, @@ -3824,6 +4224,7 @@ struct smb_version_operations smb30_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_operations smb311_operations = { @@ -3836,7 +4237,9 @@ struct smb_version_operations smb311_operations = { .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, .wait_mtu_credits = smb2_wait_mtu_credits, + .adjust_credits = smb2_adjust_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3848,7 +4251,7 @@ struct smb_version_operations smb311_operations = { .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb2_downgrade_oplock, + .downgrade_oplock = smb21_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb3_negotiate_wsize, @@ -3930,6 +4333,7 @@ struct smb_version_operations smb311_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 77b3aaa39b35..a37774a55f3a 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -107,13 +107,13 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, struct TCP_Server_Info *server = tcon->ses->server; spin_lock(&server->req_lock); - /* Request up to 2 credits but don't go over the limit. */ + /* Request up to 10 credits but don't go over the limit. 
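(The larger request lets the client build up a credit balance for compound and parallel operations without extra round trips.)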
*/ if (server->credits >= server->max_credits) shdr->CreditRequest = cpu_to_le16(0); else shdr->CreditRequest = cpu_to_le16( min_t(int, server->max_credits - - server->credits, 2)); + server->credits, 10)); spin_unlock(&server->req_lock); } else { shdr->CreditRequest = cpu_to_le16(2); @@ -173,8 +173,8 @@ static int __smb2_reconnect(const struct nls_table *nlsc, return -ENOMEM; if (tcon->ipc) { - snprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", - tcon->ses->server->hostname); + scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", + tcon->ses->server->hostname); rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); goto out; } @@ -206,7 +206,7 @@ static int __smb2_reconnect(const struct nls_table *nlsc, continue; } - snprintf(tree, MAX_TREE_SIZE, "\\%s", tgt); + scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt); rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); if (!rc) @@ -490,6 +490,23 @@ build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt) { pneg_ctxt->ContextType = SMB2_POSIX_EXTENSIONS_AVAILABLE; pneg_ctxt->DataLength = cpu_to_le16(POSIX_CTXT_DATA_LEN); + /* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */ + pneg_ctxt->Name[0] = 0x93; + pneg_ctxt->Name[1] = 0xAD; + pneg_ctxt->Name[2] = 0x25; + pneg_ctxt->Name[3] = 0x50; + pneg_ctxt->Name[4] = 0x9C; + pneg_ctxt->Name[5] = 0xB4; + pneg_ctxt->Name[6] = 0x11; + pneg_ctxt->Name[7] = 0xE7; + pneg_ctxt->Name[8] = 0xB4; + pneg_ctxt->Name[9] = 0x23; + pneg_ctxt->Name[10] = 0x83; + pneg_ctxt->Name[11] = 0xDE; + pneg_ctxt->Name[12] = 0x96; + pneg_ctxt->Name[13] = 0x8B; + pneg_ctxt->Name[14] = 0xCD; + pneg_ctxt->Name[15] = 0x7C; } static void @@ -815,8 +832,11 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) } else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) { /* ops set to 3.0 by default for default so update */ ses->server->ops = &smb21_operations; - } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) + ses->server->vals = &smb21_values; + } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) { ses->server->ops = &smb311_operations; + ses->server->vals = &smb311_values; + } } else if (le16_to_cpu(rsp->DialectRevision) != ses->server->vals->protocol_id) { /* if requested single dialect ensure returned dialect matched */ @@ -985,9 +1005,16 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, - (char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen); - - if (rc != 0) { + (char *)pneg_inbuf, inbuflen, CIFSMaxBufSize, + (char **)&pneg_rsp, &rsplen); + if (rc == -EOPNOTSUPP) { + /* + * Old Windows versions or Netapp SMB server can return + * not supported error. Client should accept it. + */ + cifs_dbg(VFS, "Server does not support validate negotiate\n"); + return 0; + } else if (rc != 0) { cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc); rc = -EIO; goto out_free_inbuf; @@ -1605,15 +1632,25 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, iov[1].iov_base = unc_path; iov[1].iov_len = unc_path_len; - /* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */ + /* + * 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 + * unless it is guest or anonymous user. 
See MS-SMB2 3.2.5.3.1 + * (Samba servers don't always set the flag so also check if null user) + */ if ((ses->server->dialect == SMB311_PROT_ID) && - !smb3_encryption_required(tcon)) + !smb3_encryption_required(tcon) && + !(ses->session_flags & + (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) && + ((ses->user_name != NULL) || (ses->sectype == Kerberos))) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; memset(&rqst, 0, sizeof(struct smb_rqst)); rqst.rq_iov = iov; rqst.rq_nvec = 2; + /* Need 64 for max size write so ask for more in case not there yet */ + req->sync_hdr.CreditRequest = cpu_to_le16(64); + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base; @@ -1771,9 +1808,10 @@ create_reconnect_durable_buf(struct cifs_fid *fid) return buf; } -static __u8 -parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, - unsigned int *epoch, char *lease_key) +__u8 +smb2_parse_lease_state(struct TCP_Server_Info *server, + struct smb2_create_rsp *rsp, + unsigned int *epoch, char *lease_key) { char *data_offset; struct create_context *cc; @@ -1824,8 +1862,9 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, } static struct create_durable_v2 * -create_durable_v2_buf(struct cifs_fid *pfid) +create_durable_v2_buf(struct cifs_open_parms *oparms) { + struct cifs_fid *pfid = oparms->fid; struct create_durable_v2 *buf; buf = kzalloc(sizeof(struct create_durable_v2), GFP_KERNEL); @@ -1839,7 +1878,14 @@ create_durable_v2_buf(struct cifs_fid *pfid) (struct create_durable_v2, Name)); buf->ccontext.NameLength = cpu_to_le16(4); - buf->dcontext.Timeout = 0; /* Should this be configurable by workload */ + /* + * NB: Handle timeout defaults to 0, which allows server to choose + * (most servers default to 120 seconds) and most clients default to 0. + * This can be overridden at mount ("handletimeout=") if the user wants + * a different persistent (or resilient) handle timeout for all + * opens on a particular SMB3 mount.
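+ * For example, mounting with "handletimeout=5000" requests a five second timeout; the value is in milliseconds and is copied into dcontext.Timeout just below.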
+ */ + buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout); buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); generate_random_uuid(buf->dcontext.CreateGuid); memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); @@ -1892,7 +1938,7 @@ add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec, struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; - iov[num].iov_base = create_durable_v2_buf(oparms->fid); + iov[num].iov_base = create_durable_v2_buf(oparms); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_durable_v2); @@ -2170,6 +2216,8 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, rqst.rq_iov = iov; rqst.rq_nvec = n_iov; + trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE, + FILE_WRITE_ATTRIBUTES); /* resource #4: response buffer */ rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); if (rc) { @@ -2388,6 +2436,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (rc) goto creat_exit; + trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid, + oparms->create_options, oparms->desired_access); + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; @@ -2425,8 +2476,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) - *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch, - oparms->fid->lease_key); + *oplock = smb2_parse_lease_state(server, rsp, + &oparms->fid->epoch, + oparms->fid->lease_key); else *oplock = rsp->OplockLevel; creat_exit: @@ -2435,113 +2487,138 @@ creat_exit: return rc; } -/* - * SMB2 IOCTL is used for both IOCTLs and FSCTLs - */ int -SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 opcode, bool is_fsctl, - char *in_data, u32 indatalen, - char **out_data, u32 *plen /* returned data len */) +SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, u32 opcode, + bool is_fsctl, char *in_data, u32 indatalen, + __u32 max_response_size) { - struct smb_rqst rqst; struct smb2_ioctl_req *req; - struct smb2_ioctl_rsp *rsp; - struct cifs_ses *ses; - struct kvec iov[2]; - struct kvec rsp_iov; - int resp_buftype; - int n_iov; - int rc = 0; - int flags = 0; + struct kvec *iov = rqst->rq_iov; unsigned int total_len; - - cifs_dbg(FYI, "SMB2 IOCTL\n"); - - if (out_data != NULL) - *out_data = NULL; - - /* zero out returned data len, in case of error */ - if (plen) - *plen = 0; - - if (tcon) - ses = tcon->ses; - else - return -EIO; - - if (!ses || !(ses->server)) - return -EIO; + int rc; rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len); if (rc) return rc; - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; - req->CtlCode = cpu_to_le32(opcode); req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; + iov[0].iov_base = (char *)req; + /* + * If no input data, the size of ioctl struct in + * protocol spec still includes a 1 byte data buffer, + * but if input data passed to ioctl, we do not + * want to double count this, so we do not send + * the dummy one byte of data in iovec[0] if sending + * input data (in iovec[1]). 
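+ * That is: with no input data, iov[0] covers the whole request (total_len); with input data, iov[0] is trimmed to total_len - 1 and iov[1] points at the indatalen bytes of in_data.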
+ */ if (indatalen) { req->InputCount = cpu_to_le32(indatalen); /* do not set InputOffset if no input data */ req->InputOffset = cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer)); + rqst->rq_nvec = 2; + iov[0].iov_len = total_len - 1; iov[1].iov_base = in_data; iov[1].iov_len = indatalen; - n_iov = 2; - } else - n_iov = 1; + } else { + rqst->rq_nvec = 1; + iov[0].iov_len = total_len; + } req->OutputOffset = 0; req->OutputCount = 0; /* MBZ */ /* - * Could increase MaxOutputResponse, but that would require more - * than one credit. Windows typically sets this smaller, but for some + * In most cases max_response_size is set to 16K (CIFSMaxBufSize). + * We could increase the default MaxOutputResponse, but that could require + * more credits. Windows typically sets this smaller, but for some * ioctls it may be useful to allow server to send more. No point * limiting what the server can send as long as fits in one credit - * Unfortunately - we can not handle more than CIFS_MAX_MSG_SIZE - * (by default, note that it can be overridden to make max larger) - * in responses (except for read responses which can be bigger. - * We may want to bump this limit up + * We can not handle more than CIFS_MAX_BUF_SIZE yet but may want + * to increase this limit in the future. + * Note that for snapshot queries, servers like Azure expect that + * the first query be minimal size (and just used to get the number/size + * of previous versions) so response size must be specified as EXACTLY + * sizeof(struct snapshot_array) which is 16 when rounded up to multiple + * of eight bytes. Currently that is the only case where we set max + * response size smaller. */ - req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize); + req->MaxOutputResponse = cpu_to_le32(max_response_size); if (is_fsctl) req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); else req->Flags = 0; - iov[0].iov_base = (char *)req; - - /* - * If no input data, the size of ioctl struct in - * protocol spec still includes a 1 byte data buffer, - * but if input data passed to ioctl, we do not - * want to double count this, so we do not send - * the dummy one byte of data in iovec[0] if sending - * input data (in iovec[1]).
- */ - - if (indatalen) { - iov[0].iov_len = total_len - 1; - } else - iov[0].iov_len = total_len; - /* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + return 0; +} + +void +SMB2_ioctl_free(struct smb_rqst *rqst) +{ + if (rqst && rqst->rq_iov) + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ +} + + +/* + * SMB2 IOCTL is used for both IOCTLs and FSCTLs + */ +int +SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, + u64 volatile_fid, u32 opcode, bool is_fsctl, + char *in_data, u32 indatalen, u32 max_out_data_len, + char **out_data, u32 *plen /* returned data len */) +{ + struct smb_rqst rqst; + struct smb2_ioctl_rsp *rsp = NULL; + struct cifs_ses *ses; + struct kvec iov[SMB2_IOCTL_IOV_SIZE]; + struct kvec rsp_iov = {NULL, 0}; + int resp_buftype = CIFS_NO_BUFFER; + int rc = 0; + int flags = 0; + + cifs_dbg(FYI, "SMB2 IOCTL\n"); + + if (out_data != NULL) + *out_data = NULL; + + /* zero out returned data len, in case of error */ + if (plen) + *plen = 0; + + if (tcon) + ses = tcon->ses; + else + return -EIO; + + if (!ses || !(ses->server)) + return -EIO; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + memset(&rqst, 0, sizeof(struct smb_rqst)); + memset(&iov, 0, sizeof(iov)); rqst.rq_iov = iov; - rqst.rq_nvec = n_iov; + rqst.rq_nvec = SMB2_IOCTL_IOV_SIZE; + + rc = SMB2_ioctl_init(tcon, &rqst, persistent_fid, volatile_fid, opcode, + is_fsctl, in_data, indatalen, max_out_data_len); + if (rc) + goto ioctl_exit; rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base; if (rc != 0) @@ -2591,6 +2668,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, } ioctl_exit: + SMB2_ioctl_free(&rqst); free_rsp_buf(resp_buftype, rsp); return rc; } @@ -2613,7 +2691,8 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SET_COMPRESSION, true /* is_fsctl */, (char *)&fsctl_input /* data input */, - 2 /* in data len */, &ret_data /* out data */, NULL); + 2 /* in data len */, CIFSMaxBufSize /* max out data */, + &ret_data /* out data */, NULL); cifs_dbg(FYI, "set compression rc %d\n", rc); @@ -2837,6 +2916,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto qinf_exit; + trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid, + ses->Suid, info_class, (__u32)info_type); + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; @@ -2847,6 +2929,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, goto qinf_exit; } + trace_smb3_query_info_done(xid, persistent_fid, tcon->tid, + ses->Suid, info_class, (__u32)info_type); + if (dlen) { *dlen = le32_to_cpu(rsp->OutputBufferLength); if (!*data) { @@ -2924,14 +3009,16 @@ smb2_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf; - unsigned int credits_received = 0; + struct cifs_credits credits = { .value = 0, .instance = 0 }; if (mid->mid_state == MID_RESPONSE_RECEIVED - || mid->mid_state == MID_RESPONSE_MALFORMED) - credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); + || mid->mid_state == MID_RESPONSE_MALFORMED) { + credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + 
credits.instance = server->reconnect_instance; + } DeleteMidQEntry(mid); - add_credits(server, credits_received, CIFS_ECHO_OP); + add_credits(server, &credits, CIFS_ECHO_OP); } void smb2_reconnect_server(struct work_struct *work) @@ -3023,7 +3110,7 @@ SMB2_echo(struct TCP_Server_Info *server) iov[0].iov_base = (char *)req; rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL, - server, CIFS_ECHO_OP); + server, CIFS_ECHO_OP, NULL); if (rc) cifs_dbg(FYI, "Echo request failed: %d\n", rc); @@ -3114,6 +3201,11 @@ smb2_new_read_req(void **buf, unsigned int *total_len, req->MinimumCount = 0; req->Length = cpu_to_le32(io_parms->length); req->Offset = cpu_to_le64(io_parms->offset); + + trace_smb3_read_enter(0 /* xid */, + io_parms->persistent_fid, + io_parms->tcon->tid, io_parms->tcon->ses->Suid, + io_parms->offset, io_parms->length); #ifdef CONFIG_CIFS_SMB_DIRECT /* * If we want to do a RDMA write, fill in and append @@ -3184,7 +3276,7 @@ smb2_readv_callback(struct mid_q_entry *mid) struct TCP_Server_Info *server = tcon->ses->server; struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)rdata->iov[0].iov_base; - unsigned int credits_received = 0; + struct cifs_credits credits = { .value = 0, .instance = 0 }; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, .rq_pages = rdata->pages, @@ -3199,7 +3291,8 @@ smb2_readv_callback(struct mid_q_entry *mid) switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: - credits_received = le16_to_cpu(shdr->CreditRequest); + credits.value = le16_to_cpu(shdr->CreditRequest); + credits.instance = server->reconnect_instance; /* result already set, check signature */ if (server->sign && !mid->decrypted) { int rc; @@ -3224,11 +3317,11 @@ smb2_readv_callback(struct mid_q_entry *mid) cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_RESPONSE_MALFORMED: - credits_received = le16_to_cpu(shdr->CreditRequest); + credits.value = le16_to_cpu(shdr->CreditRequest); + credits.instance = server->reconnect_instance; /* fall through */ default: - if (rdata->result != -ENODATA) - rdata->result = -EIO; + rdata->result = -EIO; } #ifdef CONFIG_CIFS_SMB_DIRECT /* @@ -3255,7 +3348,7 @@ smb2_readv_callback(struct mid_q_entry *mid) queue_work(cifsiod_wq, &rdata->work); DeleteMidQEntry(mid); - add_credits(server, credits_received, 0); + add_credits(server, &credits, 0); } /* smb2_async_readv - send an async read, and set up mid to handle result */ @@ -3285,17 +3378,8 @@ smb2_async_readv(struct cifs_readdata *rdata) rc = smb2_new_read_req( (void **) &buf, &total_len, &io_parms, rdata, 0, 0); - if (rc) { - if (rc == -EAGAIN && rdata->credits) { - /* credits was reset by reconnect */ - rdata->credits = 0; - /* reduce in_flight value since we won't send the req */ - spin_lock(&server->req_lock); - server->in_flight--; - spin_unlock(&server->req_lock); - } + if (rc) return rc; - } if (smb3_encryption_required(io_parms.tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -3305,24 +3389,24 @@ smb2_async_readv(struct cifs_readdata *rdata) shdr = (struct smb2_sync_hdr *)buf; - if (rdata->credits) { + if (rdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, SMB2_MAX_BUFFER_SIZE)); shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1); - spin_lock(&server->req_lock); - server->credits += rdata->credits - - le16_to_cpu(shdr->CreditCharge); - spin_unlock(&server->req_lock); - wake_up(&server->request_q); - rdata->credits = le16_to_cpu(shdr->CreditCharge); + + rc = adjust_credits(server, &rdata->credits, rdata->bytes); + if (rc) + 
goto async_readv_out; + flags |= CIFS_HAS_CREDITS; } kref_get(&rdata->refcount); rc = cifs_call_async(io_parms.tcon->ses->server, &rqst, cifs_readv_receive, smb2_readv_callback, - smb3_handle_read_data, rdata, flags); + smb3_handle_read_data, rdata, flags, + &rdata->credits); if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); @@ -3332,6 +3416,7 @@ smb2_async_readv(struct cifs_readdata *rdata) io_parms.offset, io_parms.length, rc); } +async_readv_out: cifs_small_buf_release(buf); return rc; } @@ -3366,8 +3451,6 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, rqst.rq_nvec = 1; rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); - rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; if (rc) { @@ -3378,14 +3461,20 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length, rc); - } + } else + trace_smb3_read_done(xid, req->PersistentFileId, + io_parms->tcon->tid, ses->Suid, + io_parms->offset, 0); free_rsp_buf(resp_buftype, rsp_iov.iov_base); + cifs_small_buf_release(req); return rc == -ENODATA ? 0 : rc; } else trace_smb3_read_done(xid, req->PersistentFileId, io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length); + cifs_small_buf_release(req); + *nbytes = le32_to_cpu(rsp->DataLength); if ((*nbytes > CIFS_MAX_MSGSIZE) || (*nbytes > io_parms->length)) { @@ -3417,14 +3506,16 @@ smb2_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; unsigned int written; struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; - unsigned int credits_received = 0; + struct cifs_credits credits = { .value = 0, .instance = 0 }; switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: - credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); - wdata->result = smb2_check_receive(mid, tcon->ses->server, 0); + credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.instance = server->reconnect_instance; + wdata->result = smb2_check_receive(mid, server, 0); if (wdata->result != 0) break; @@ -3448,7 +3539,8 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->result = -EAGAIN; break; case MID_RESPONSE_MALFORMED: - credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.instance = server->reconnect_instance; /* fall through */ default: wdata->result = -EIO; @@ -3481,7 +3573,7 @@ smb2_writev_callback(struct mid_q_entry *mid) queue_work(cifsiod_wq, &wdata->work); DeleteMidQEntry(mid); - add_credits(tcon->ses->server, credits_received, 0); + add_credits(server, &credits, 0); } /* smb2_async_writev - send an async write, and set up mid to handle result */ @@ -3499,17 +3591,8 @@ smb2_async_writev(struct cifs_writedata *wdata, unsigned int total_len; rc = smb2_plain_req_init(SMB2_WRITE, tcon, (void **) &req, &total_len); - if (rc) { - if (rc == -EAGAIN && wdata->credits) { - /* credits was reset by reconnect */ - wdata->credits = 0; - /* reduce in_flight value since we won't send the req */ - spin_lock(&server->req_lock); - server->in_flight--; - spin_unlock(&server->req_lock); - } - goto async_writev_out; - } + if (rc) + return rc; if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -3526,6 +3609,9 @@ smb2_async_writev(struct cifs_writedata 
*wdata, req->DataOffset = cpu_to_le16( offsetof(struct smb2_write_req, Buffer)); req->RemainingBytes = 0; + + trace_smb3_write_enter(0 /* xid */, wdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes); #ifdef CONFIG_CIFS_SMB_DIRECT /* * If we want to do a server RDMA read, fill in and append @@ -3595,23 +3681,22 @@ smb2_async_writev(struct cifs_writedata *wdata, req->Length = cpu_to_le32(wdata->bytes); #endif - if (wdata->credits) { + if (wdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, SMB2_MAX_BUFFER_SIZE)); shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1); - spin_lock(&server->req_lock); - server->credits += wdata->credits - - le16_to_cpu(shdr->CreditCharge); - spin_unlock(&server->req_lock); - wake_up(&server->request_q); - wdata->credits = le16_to_cpu(shdr->CreditCharge); + + rc = adjust_credits(server, &wdata->credits, wdata->bytes); + if (rc) + goto async_writev_out; + flags |= CIFS_HAS_CREDITS; } kref_get(&wdata->refcount); rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, NULL, - wdata, flags); + wdata, flags, &wdata->credits); if (rc) { trace_smb3_write_err(0 /* no xid */, req->PersistentFileId, @@ -3674,6 +3759,10 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, offsetof(struct smb2_write_req, Buffer)); req->RemainingBytes = 0; + trace_smb3_write_enter(xid, io_parms->persistent_fid, + io_parms->tcon->tid, io_parms->tcon->ses->Suid, + io_parms->offset, io_parms->length); + iov[0].iov_base = (char *)req; /* 1 for Buffer */ iov[0].iov_len = total_len - 1; @@ -3684,7 +3773,6 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, rc = cifs_send_recv(xid, io_parms->tcon->ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; if (rc) { @@ -3702,6 +3790,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, io_parms->offset, *nbytes); } + cifs_small_buf_release(req); free_rsp_buf(resp_buftype, rsp); return rc; } @@ -3836,6 +3925,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 2; + trace_smb3_query_dir_enter(xid, persistent_fid, tcon->tid, + tcon->ses->Suid, index, output_size); + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; @@ -3843,18 +3935,26 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { if (rc == -ENODATA && rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { + trace_smb3_query_dir_done(xid, persistent_fid, + tcon->tid, tcon->ses->Suid, index, 0); srch_inf->endOfSearch = true; rc = 0; - } else + } else { + trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid, + tcon->ses->Suid, index, 0, rc); cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); + } goto qdir_exit; } rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, info_buf_size); - if (rc) + if (rc) { + trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid, + tcon->ses->Suid, index, 0, rc); goto qdir_exit; + } srch_inf->unicode = true; @@ -3882,6 +3982,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, else cifs_dbg(VFS, "illegal search buffer type\n"); + trace_smb3_query_dir_done(xid, persistent_fid, tcon->tid, + tcon->ses->Suid, index, srch_inf->entries_in_buffer); return rc; qdir_exit: diff --git 
a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 538e2299805f..ee8977688e21 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -288,12 +288,12 @@ struct smb2_encryption_neg_context { __le16 Ciphers[1]; /* Ciphers[0] since only one used now */ } __packed; -#define POSIX_CTXT_DATA_LEN 8 +#define POSIX_CTXT_DATA_LEN 16 struct smb2_posix_neg_context { __le16 ContextType; /* 0x100 */ __le16 DataLength; __le32 Reserved; - __le64 Reserved1; /* In case needed for future (eg version or caps) */ + __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */ } __packed; struct smb2_negotiate_rsp { @@ -959,6 +959,13 @@ struct duplicate_extents_to_file { __le64 ByteCount; /* Bytes to be copied */ } __packed; +/* + * Maximum number of iovs we need for an ioctl request. + * [0] : struct smb2_ioctl_req + * [1] : in_data + */ +#define SMB2_IOCTL_IOV_SIZE 2 + struct smb2_ioctl_req { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 57 */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 87733b27a65f..52df125e9189 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -142,8 +142,13 @@ extern int SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, extern void SMB2_open_free(struct smb_rqst *rqst); extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, - bool is_fsctl, char *in_data, u32 indatalen, + bool is_fsctl, char *in_data, u32 indatalen, u32 maxoutlen, char **out_data, u32 *plen /* returned data len */); +extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, u32 opcode, + bool is_fsctl, char *in_data, u32 indatalen, + __u32 max_response_size); +extern void SMB2_ioctl_free(struct smb_rqst *rqst); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, @@ -223,6 +228,9 @@ extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *); extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *, enum securityEnum); +extern __u8 smb2_parse_lease_state(struct TCP_Server_Info *server, + struct smb2_create_rsp *rsp, + unsigned int *epoch, char *lease_key); extern int smb3_encryption_required(const struct cifs_tcon *tcon); extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length, struct kvec *iov, unsigned int min_buf_size); diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h index 3d5f62150de4..447c0c6e4c64 100644 --- a/fs/cifs/smb2status.h +++ b/fs/cifs/smb2status.h @@ -30,9 +30,9 @@ */ #define STATUS_SEVERITY_SUCCESS __constant_cpu_to_le32(0x0000) -#define STATUS_SEVERITY_INFORMATIONAL __constanst_cpu_to_le32(0x0001) -#define STATUS_SEVERITY_WARNING __constanst_cpu_to_le32(0x0002) -#define STATUS_SEVERITY_ERROR __constanst_cpu_to_le32(0x0003) +#define STATUS_SEVERITY_INFORMATIONAL cpu_to_le32(0x0001) +#define STATUS_SEVERITY_WARNING cpu_to_le32(0x0002) +#define STATUS_SEVERITY_ERROR cpu_to_le32(0x0003) struct ntstatus { /* Facility is the high 12 bits of the following field */ diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 7b351c65ee46..d1181572758b 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -576,6 +576,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, struct TCP_Server_Info *server) { struct mid_q_entry *temp; + unsigned int credits = le16_to_cpu(shdr->CreditCharge); if (server == NULL) { 
cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n"); @@ -586,6 +587,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, memset(temp, 0, sizeof(struct mid_q_entry)); kref_init(&temp->refcount); temp->mid = le64_to_cpu(shdr->MessageId); + temp->credits = credits > 0 ? credits : 1; temp->pid = current->pid; temp->command = shdr->Command; /* Always LE */ temp->when_alloc = jiffies; @@ -600,6 +602,8 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, atomic_inc(&midCount); temp->mid_state = MID_REQUEST_ALLOCATED; + trace_smb3_cmd_enter(shdr->TreeId, shdr->SessionId, + le16_to_cpu(shdr->Command), temp->mid); return temp; } @@ -615,6 +619,10 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr, return -EAGAIN; } + if (ses->server->tcpStatus == CifsNeedNegotiate && + shdr->Command != SMB2_NEGOTIATE) + return -EAGAIN; + if (ses->status == CifsNew) { if ((shdr->Command != SMB2_SESSION_SETUP) && (shdr->Command != SMB2_NEGOTIATE)) @@ -634,6 +642,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr, spin_lock(&GlobalMid_Lock); list_add_tail(&(*mid)->qhead, &ses->server->pending_mid_q); spin_unlock(&GlobalMid_Lock); + return 0; } @@ -674,13 +683,18 @@ smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) smb2_seq_num_into_buf(ses->server, shdr); rc = smb2_get_mid_entry(ses, shdr, &mid); - if (rc) + if (rc) { + revert_current_mid_from_hdr(ses->server, shdr); return ERR_PTR(rc); + } + rc = smb2_sign_rqst(rqst, ses->server); if (rc) { + revert_current_mid_from_hdr(ses->server, shdr); cifs_delete_mid(mid); return ERR_PTR(rc); } + return mid; } @@ -692,14 +706,21 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; + if (server->tcpStatus == CifsNeedNegotiate && + shdr->Command != SMB2_NEGOTIATE) + return ERR_PTR(-EAGAIN); + smb2_seq_num_into_buf(server, shdr); mid = smb2_mid_entry_alloc(shdr, server); - if (mid == NULL) + if (mid == NULL) { + revert_current_mid_from_hdr(server, shdr); return ERR_PTR(-ENOMEM); + } rc = smb2_sign_rqst(rqst, server); if (rc) { + revert_current_mid_from_hdr(server, shdr); DeleteMidQEntry(mid); return ERR_PTR(rc); } diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index a568dac7b3a1..b943b74cd246 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -1550,7 +1550,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) char name[MAX_NAME_LEN]; int rc; - snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info); info->request_cache = kmem_cache_create( name, @@ -1566,7 +1566,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) if (!info->request_mempool) goto out1; - snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info); info->response_cache = kmem_cache_create( name, @@ -1582,7 +1582,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) if (!info->response_mempool) goto out3; - snprintf(name, MAX_NAME_LEN, "smbd_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbd_%p", info); info->workqueue = create_workqueue(name); if (!info->workqueue) goto out4; diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 59be48206932..99c4d799c24b 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -58,6 +58,9 @@ DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \ DEFINE_SMB3_RW_ERR_EVENT(write_err); DEFINE_SMB3_RW_ERR_EVENT(read_err); 
+DEFINE_SMB3_RW_ERR_EVENT(query_dir_err); +DEFINE_SMB3_RW_ERR_EVENT(zero_err); +DEFINE_SMB3_RW_ERR_EVENT(falloc_err); /* For logging successful read or write */ @@ -100,8 +103,16 @@ DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \ __u32 len), \ TP_ARGS(xid, fid, tid, sesid, offset, len)) +DEFINE_SMB3_RW_DONE_EVENT(write_enter); +DEFINE_SMB3_RW_DONE_EVENT(read_enter); +DEFINE_SMB3_RW_DONE_EVENT(query_dir_enter); +DEFINE_SMB3_RW_DONE_EVENT(zero_enter); +DEFINE_SMB3_RW_DONE_EVENT(falloc_enter); DEFINE_SMB3_RW_DONE_EVENT(write_done); DEFINE_SMB3_RW_DONE_EVENT(read_done); +DEFINE_SMB3_RW_DONE_EVENT(query_dir_done); +DEFINE_SMB3_RW_DONE_EVENT(zero_done); +DEFINE_SMB3_RW_DONE_EVENT(falloc_done); /* * For handle based calls other than read and write, and get/set info @@ -148,6 +159,48 @@ DEFINE_SMB3_FD_ERR_EVENT(close_err); /* * For handle based query/set info calls */ +DECLARE_EVENT_CLASS(smb3_inf_enter_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + __u8 infclass, + __u32 type), + TP_ARGS(xid, fid, tid, sesid, infclass, type), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u8, infclass) + __field(__u32, type) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->infclass = infclass; + __entry->type = type; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx class=%u type=0x%x", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid, + __entry->infclass, __entry->type) +) + +#define DEFINE_SMB3_INF_ENTER_EVENT(name) \ +DEFINE_EVENT(smb3_inf_enter_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + __u8 infclass, \ + __u32 type), \ + TP_ARGS(xid, fid, tid, sesid, infclass, type)) + +DEFINE_SMB3_INF_ENTER_EVENT(query_info_enter); +DEFINE_SMB3_INF_ENTER_EVENT(query_info_done); + DECLARE_EVENT_CLASS(smb3_inf_err_class, TP_PROTO(unsigned int xid, __u64 fid, @@ -195,6 +248,123 @@ DEFINE_SMB3_INF_ERR_EVENT(query_info_err); DEFINE_SMB3_INF_ERR_EVENT(set_info_err); DEFINE_SMB3_INF_ERR_EVENT(fsctl_err); +DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid, + const char *full_path), + TP_ARGS(xid, tid, sesid, full_path), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + __string(path, full_path) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + __assign_str(path, full_path); + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x path=%s", + __entry->xid, __entry->sesid, __entry->tid, + __get_str(path)) +) + +#define DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(name) \ +DEFINE_EVENT(smb3_inf_compound_enter_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid, \ + const char *full_path), \ + TP_ARGS(xid, tid, sesid, full_path)) + +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); + + +DECLARE_EVENT_CLASS(smb3_inf_compound_done_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid), + TP_ARGS(xid, tid, sesid), + 
TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x", + __entry->xid, __entry->sesid, __entry->tid) +) + +#define DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(name) \ +DEFINE_EVENT(smb3_inf_compound_done_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid), \ + TP_ARGS(xid, tid, sesid)) + +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); + + +DECLARE_EVENT_CLASS(smb3_inf_compound_err_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid, + int rc), + TP_ARGS(xid, tid, sesid, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->rc = rc; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x rc=%d", + __entry->xid, __entry->sesid, __entry->tid, + __entry->rc) +) + +#define DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_inf_compound_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid, \ + int rc), \ + TP_ARGS(xid, tid, sesid, rc)) + +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); + /* * For logging SMB3 Status code and Command for responses which return errors */ @@ -270,6 +440,7 @@ DEFINE_EVENT(smb3_cmd_done_class, smb3_##name, \ __u64 mid), \ TP_ARGS(tid, sesid, cmd, mid)) +DEFINE_SMB3_CMD_DONE_EVENT(cmd_enter); DEFINE_SMB3_CMD_DONE_EVENT(cmd_done); DEFINE_SMB3_CMD_DONE_EVENT(ses_expired); @@ -378,19 +549,19 @@ DECLARE_EVENT_CLASS(smb3_tcon_class, __field(unsigned int, xid) __field(__u32, tid) __field(__u64, sesid) - __field(const char *, unc_name) + __string(name, unc_name) __field(int, rc) ), TP_fast_assign( __entry->xid = xid; __entry->tid = tid; __entry->sesid = sesid; - __entry->unc_name = unc_name; + __assign_str(name, unc_name); __entry->rc = rc; ), TP_printk("xid=%u sid=0x%llx tid=0x%x unc_name=%s rc=%d", __entry->xid, __entry->sesid, __entry->tid, - __entry->unc_name, __entry->rc) + __get_str(name), __entry->rc) ) #define DEFINE_SMB3_TCON_EVENT(name) \ @@ -406,8 +577,47 @@ DEFINE_SMB3_TCON_EVENT(tcon); /* - * For smb2/smb3 open call + * For smb2/smb3 open (including create and mkdir) calls */ + +DECLARE_EVENT_CLASS(smb3_open_enter_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid, + int create_options, + int desired_access), + TP_ARGS(xid, tid, sesid, create_options, desired_access), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + __field(int, create_options) + __field(int, desired_access) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = 
sesid; + __entry->create_options = create_options; + __entry->desired_access = desired_access; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x cr_opts=0x%x des_access=0x%x", + __entry->xid, __entry->sesid, __entry->tid, + __entry->create_options, __entry->desired_access) +) + +#define DEFINE_SMB3_OPEN_ENTER_EVENT(name) \ +DEFINE_EVENT(smb3_open_enter_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid, \ + int create_options, \ + int desired_access), \ + TP_ARGS(xid, tid, sesid, create_options, desired_access)) + +DEFINE_SMB3_OPEN_ENTER_EVENT(open_enter); +DEFINE_SMB3_OPEN_ENTER_EVENT(posix_mkdir_enter); + DECLARE_EVENT_CLASS(smb3_open_err_class, TP_PROTO(unsigned int xid, __u32 tid, @@ -626,6 +836,7 @@ DEFINE_EVENT(smb3_credit_class, smb3_##name, \ TP_ARGS(currmid, hostname, credits)) DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits); +DEFINE_SMB3_CREDIT_EVENT(credit_timeout); #endif /* _CIFS_TRACE_H */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 53532bd3f50d..1de8e996e566 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -33,6 +33,7 @@ #include <linux/uaccess.h> #include <asm/processor.h> #include <linux/mempool.h> +#include <linux/signal.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -291,6 +292,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, int n_vec; unsigned int send_length = 0; unsigned int i, j; + sigset_t mask, oldmask; size_t total_len = 0, sent, size; struct socket *ssocket = server->ssocket; struct msghdr smb_msg; @@ -301,8 +303,14 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, rc = smbd_send(server, rqst); goto smbd_done; } + if (ssocket == NULL) - return -ENOTSOCK; + return -EAGAIN; + + if (signal_pending(current)) { + cifs_dbg(FYI, "signal is pending before sending any data\n"); + return -EINTR; + } /* cork the socket */ kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, @@ -312,6 +320,16 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, send_length += smb_rqst_len(server, &rqst[j]); rfc1002_marker = cpu_to_be32(send_length); + /* + * We should not allow signals to interrupt the network send because + * any partial send will cause session reconnects, increasing the + * latency of system calls and overloading the server with unnecessary + * requests. + */ + + sigfillset(&mask); + sigprocmask(SIG_BLOCK, &mask, &oldmask); + /* Generate a rfc1002 marker for SMB2+ */ if (server->vals->header_preamble_size == 0) { struct kvec hiov = { @@ -321,7 +339,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) - goto uncork; + goto unmask; total_len += sent; send_length += 4; @@ -343,7 +361,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) - goto uncork; + goto unmask; total_len += sent; @@ -365,7 +383,25 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, } } -uncork: +unmask: + sigprocmask(SIG_SETMASK, &oldmask, NULL); + + /* + * If a signal is pending but we have already sent the whole packet to + * the server, we need to return success status to allow a corresponding + * mid entry to be kept in the pending requests queue, thus allowing + * the client to handle responses from the server.
+ * + * If only part of the packet has been sent there is no need to hide + * the interrupt because the session will be reconnected anyway, so there + * won't be any response from the server to handle. + */ + + if (signal_pending(current) && (total_len != send_length)) { + cifs_dbg(FYI, "signal is pending after attempt to send\n"); + rc = -EINTR; + } + /* uncork it */ val = 0; kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, @@ -450,29 +486,55 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, } static int -wait_for_free_credits(struct TCP_Server_Info *server, const int timeout, - int *credits) +wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, + const int timeout, const int flags, + unsigned int *instance) { int rc; + int *credits; + int optype; + long int t; + + if (timeout < 0) + t = MAX_JIFFY_OFFSET; + else + t = msecs_to_jiffies(timeout); + + optype = flags & CIFS_OP_MASK; + + *instance = 0; + + credits = server->ops->get_credits_field(server, optype); + /* Since an echo is already inflight, no need to wait to send another */ + if (*credits <= 0 && optype == CIFS_ECHO_OP) + return -EAGAIN; spin_lock(&server->req_lock); - if (timeout == CIFS_ASYNC_OP) { + if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) { /* oplock breaks must not be held up */ server->in_flight++; *credits -= 1; + *instance = server->reconnect_instance; spin_unlock(&server->req_lock); return 0; } while (1) { - if (*credits <= 0) { + if (*credits < num_credits) { spin_unlock(&server->req_lock); cifs_num_waiters_inc(server); - rc = wait_event_killable(server->request_q, - has_credits(server, credits)); + rc = wait_event_killable_timeout(server->request_q, + has_credits(server, credits, num_credits), t); cifs_num_waiters_dec(server); - if (rc) - return rc; + if (!rc) { + trace_smb3_credit_timeout(server->CurrentMid, + server->hostname, num_credits); + cifs_dbg(VFS, "wait timed out after %d ms\n", + timeout); + return -ENOTSUPP; + } + if (rc == -ERESTARTSYS) + return -ERESTARTSYS; spin_lock(&server->req_lock); } else { if (server->tcpStatus == CifsExiting) { @@ -481,14 +543,53 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout, } /* + * For normal commands, reserve the last MAX_COMPOUND + * credits to compound requests. + * Otherwise these compounds could be permanently + * starved for credits by single-credit requests. + * + * To prevent spinning CPU, block this thread until + * there are >MAX_COMPOUND credits available. + * But only do this if we already have a lot of + * credits in flight to avoid triggering this check + * for servers that are slow to hand out credits on + * new sessions. + */ + if (!optype && num_credits == 1 && + server->in_flight > 2 * MAX_COMPOUND && + *credits <= MAX_COMPOUND) { + spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); + rc = wait_event_killable_timeout( + server->request_q, + has_credits(server, credits, + MAX_COMPOUND + 1), + t); + cifs_num_waiters_dec(server); + if (!rc) { + trace_smb3_credit_timeout( + server->CurrentMid, + server->hostname, num_credits); + cifs_dbg(VFS, "wait timed out after %d ms\n", + timeout); + return -ENOTSUPP; + } + if (rc == -ERESTARTSYS) + return -ERESTARTSYS; + spin_lock(&server->req_lock); + continue; + } + + /* + * Can not count locking commands against total + * as they are allowed to block on server.
*/ /* update # of requests on the wire to server */ - if (timeout != CIFS_BLOCKING_OP) { - *credits -= 1; - server->in_flight++; + if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) { + *credits -= num_credits; + server->in_flight += num_credits; + *instance = server->reconnect_instance; } spin_unlock(&server->req_lock); break; @@ -498,24 +599,45 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout, } static int -wait_for_free_request(struct TCP_Server_Info *server, const int timeout, - const int optype) +wait_for_free_request(struct TCP_Server_Info *server, const int flags, + unsigned int *instance) { - int *val; + return wait_for_free_credits(server, 1, -1, flags, + instance); +} - val = server->ops->get_credits_field(server, optype); - /* Since an echo is already inflight, no need to wait to send another */ - if (*val <= 0 && optype == CIFS_ECHO_OP) - return -EAGAIN; - return wait_for_free_credits(server, timeout, val); +static int +wait_for_compound_request(struct TCP_Server_Info *server, int num, + const int flags, unsigned int *instance) +{ + int *credits; + + credits = server->ops->get_credits_field(server, flags & CIFS_OP_MASK); + + spin_lock(&server->req_lock); + if (*credits < num) { + /* + * Return immediately if not too many requests in flight since + * we will likely be stuck on waiting for credits. + */ + if (server->in_flight < num - *credits) { + spin_unlock(&server->req_lock); + return -ENOTSUPP; + } + } + spin_unlock(&server->req_lock); + + return wait_for_free_credits(server, num, 60000, flags, + instance); } int cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, - unsigned int *num, unsigned int *credits) + unsigned int *num, struct cifs_credits *credits) { *num = size; - *credits = 0; + credits->value = 0; + credits->instance = server->reconnect_instance; return 0; } @@ -602,27 +724,43 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) int cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, mid_receive_t *receive, mid_callback_t *callback, - mid_handle_t *handle, void *cbdata, const int flags) + mid_handle_t *handle, void *cbdata, const int flags, + const struct cifs_credits *exist_credits) { - int rc, timeout, optype; + int rc; struct mid_q_entry *mid; - unsigned int credits = 0; + struct cifs_credits credits = { .value = 0, .instance = 0 }; + unsigned int instance; + int optype; - timeout = flags & CIFS_TIMEOUT_MASK; optype = flags & CIFS_OP_MASK; if ((flags & CIFS_HAS_CREDITS) == 0) { - rc = wait_for_free_request(server, timeout, optype); + rc = wait_for_free_request(server, flags, &instance); if (rc) return rc; - credits = 1; - } + credits.value = 1; + credits.instance = instance; + } else + instance = exist_credits->instance; mutex_lock(&server->srv_mutex); + + /* + * We can't use credits obtained from the previous session to send this + * request. Check if there were reconnects after we obtained credits and + * return -EAGAIN in such cases to let callers handle it. 
+ */ + if (instance != server->reconnect_instance) { + mutex_unlock(&server->srv_mutex); + add_credits_and_wake_if(server, &credits, optype); + return -EAGAIN; + } + mid = server->ops->setup_async_request(server, rqst); if (IS_ERR(mid)) { mutex_unlock(&server->srv_mutex); - add_credits_and_wake_if(server, credits, optype); + add_credits_and_wake_if(server, &credits, optype); return PTR_ERR(mid); } @@ -647,6 +785,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, cifs_in_send_dec(server); if (rc < 0) { + revert_current_mid(server, mid->credits); server->sequence_number -= 2; cifs_delete_mid(mid); } @@ -656,7 +795,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, if (rc == 0) return 0; - add_credits_and_wake_if(server, credits, optype); + add_credits_and_wake_if(server, &credits, optype); return rc; } @@ -786,8 +925,12 @@ static void cifs_compound_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->server; + struct cifs_credits credits; + + credits.value = server->ops->get_credits(mid); + credits.instance = server->reconnect_instance; - add_credits(server, server->ops->get_credits(mid), mid->optype); + add_credits(server, &credits, mid->optype); } static void @@ -809,14 +952,15 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, const int flags, const int num_rqst, struct smb_rqst *rqst, int *resp_buf_type, struct kvec *resp_iov) { - int i, j, rc = 0; - int timeout, optype; + int i, j, optype, rc = 0; struct mid_q_entry *midQ[MAX_COMPOUND]; bool cancelled_mid[MAX_COMPOUND] = {false}; - unsigned int credits[MAX_COMPOUND] = {0}; + struct cifs_credits credits[MAX_COMPOUND] = { + { .value = 0, .instance = 0 } + }; + unsigned int instance; char *buf; - timeout = flags & CIFS_TIMEOUT_MASK; optype = flags & CIFS_OP_MASK; for (i = 0; i < num_rqst; i++) @@ -831,30 +975,21 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, return -ENOENT; /* - * Ensure we obtain 1 credit per request in the compound chain. - * It can be optimized further by waiting for all the credits - * at once but this can wait long enough if we don't have enough - * credits due to some heavy operations in progress or the server - * not granting us much, so a fallback to the current approach is - * needed anyway. + * Wait for all the requests to become available. + * This approach still leaves the possibility to be stuck waiting for + * credits if the server doesn't grant credits to the outstanding + * requests and if the client is completely idle, not generating any + * other requests. + * This can be handled by the eventual session reconnect. */ + rc = wait_for_compound_request(ses->server, num_rqst, flags, + &instance); + if (rc) + return rc; + for (i = 0; i < num_rqst; i++) { - rc = wait_for_free_request(ses->server, timeout, optype); - if (rc) { - /* - * We haven't sent an SMB packet to the server yet but - * we already obtained credits for i requests in the - * compound chain - need to return those credits back - * for future use. Note that we need to call add_credits - * multiple times to match the way we obtained credits - * in the first place and to account for in flight - * requests correctly. 
- */ - for (j = 0; j < i; j++) - add_credits(ses->server, 1, optype); - return rc; - } - credits[i] = 1; + credits[i].value = 1; + credits[i].instance = instance; } /* @@ -865,16 +1000,31 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, mutex_lock(&ses->server->srv_mutex); + /* + * All the parts of the compound chain must use credits obtained from + * the same session. We can not use credits obtained from the previous + * session to send this request. Check if there were reconnects after + * we obtained credits and return -EAGAIN in such cases to let callers + * handle it. + */ + if (instance != ses->server->reconnect_instance) { + mutex_unlock(&ses->server->srv_mutex); + for (j = 0; j < num_rqst; j++) + add_credits(ses->server, &credits[j], optype); + return -EAGAIN; + } + for (i = 0; i < num_rqst; i++) { midQ[i] = ses->server->ops->setup_request(ses, &rqst[i]); if (IS_ERR(midQ[i])) { + revert_current_mid(ses->server, i); for (j = 0; j < i; j++) cifs_delete_mid(midQ[j]); mutex_unlock(&ses->server->srv_mutex); /* Update # of requests on wire to server */ for (j = 0; j < num_rqst; j++) - add_credits(ses->server, credits[j], optype); + add_credits(ses->server, &credits[j], optype); return PTR_ERR(midQ[i]); } @@ -897,15 +1047,17 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, for (i = 0; i < num_rqst; i++) cifs_save_when_sent(midQ[i]); - if (rc < 0) + if (rc < 0) { + revert_current_mid(ses->server, num_rqst); ses->server->sequence_number -= 2; + } mutex_unlock(&ses->server->srv_mutex); if (rc < 0) { /* Sending failed for some reason - return credits back */ for (i = 0; i < num_rqst; i++) - add_credits(ses->server, credits[i], optype); + add_credits(ses->server, &credits[i], optype); goto out; } @@ -924,7 +1076,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, smb311_update_preauth_hash(ses, rqst[0].rq_iov, rqst[0].rq_nvec); - if (timeout == CIFS_ASYNC_OP) + if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) goto out; for (i = 0; i < num_rqst; i++) { @@ -942,7 +1094,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, midQ[i]->mid_flags |= MID_WAIT_CANCELLED; midQ[i]->callback = cifs_cancelled_callback; cancelled_mid[i] = true; - credits[i] = 0; + credits[i].value = 0; } spin_unlock(&GlobalMid_Lock); } @@ -1061,13 +1213,14 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, int SendReceive(const unsigned int xid, struct cifs_ses *ses, struct smb_hdr *in_buf, struct smb_hdr *out_buf, - int *pbytes_returned, const int timeout) + int *pbytes_returned, const int flags) { int rc = 0; struct mid_q_entry *midQ; unsigned int len = be32_to_cpu(in_buf->smb_buf_length); struct kvec iov = { .iov_base = in_buf, .iov_len = len }; struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 }; + struct cifs_credits credits = { .value = 1, .instance = 0 }; if (ses == NULL) { cifs_dbg(VFS, "Null smb session\n"); @@ -1091,7 +1244,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, return -EIO; } - rc = wait_for_free_request(ses->server, timeout, 0); + rc = wait_for_free_request(ses->server, flags, &credits.instance); if (rc) return rc; @@ -1105,7 +1258,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (rc) { mutex_unlock(&ses->server->srv_mutex); /* Update # of requests on wire to server */ - add_credits(ses->server, 1, 0); + add_credits(ses->server, &credits, 0); return rc; } @@ -1130,7 +1283,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (rc < 0) goto out; - if (timeout == CIFS_ASYNC_OP) +
if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) goto out; rc = wait_for_response(ses->server, midQ); @@ -1141,7 +1294,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, /* no longer considered to be "in-flight" */ midQ->callback = DeleteMidQEntry; spin_unlock(&GlobalMid_Lock); - add_credits(ses->server, 1, 0); + add_credits(ses->server, &credits, 0); return rc; } spin_unlock(&GlobalMid_Lock); @@ -1149,7 +1302,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = cifs_sync_mid_result(midQ, ses->server); if (rc != 0) { - add_credits(ses->server, 1, 0); + add_credits(ses->server, &credits, 0); return rc; } @@ -1165,7 +1318,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = cifs_check_receive(midQ, ses->server, 0); out: cifs_delete_mid(midQ); - add_credits(ses->server, 1, 0); + add_credits(ses->server, &credits, 0); return rc; } @@ -1207,6 +1360,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, unsigned int len = be32_to_cpu(in_buf->smb_buf_length); struct kvec iov = { .iov_base = in_buf, .iov_len = len }; struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 }; + unsigned int instance; if (tcon == NULL || tcon->ses == NULL) { cifs_dbg(VFS, "Null smb session\n"); @@ -1232,7 +1386,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, return -EIO; } - rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0); + rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, &instance); if (rc) return rc; diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 02b7d91c9231..f0de238000c0 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -1,16 +1,16 @@ config FS_ENCRYPTION - tristate "FS Encryption (Per-file encryption)" + bool "FS Encryption (Per-file encryption)" select CRYPTO select CRYPTO_AES select CRYPTO_CBC select CRYPTO_ECB select CRYPTO_XTS select CRYPTO_CTS - select CRYPTO_CTR select CRYPTO_SHA256 select KEYS help Enable encryption of files and directories. This feature is similar to ecryptfs, but it is more memory efficient since it avoids caching the encrypted and - decrypted pages in the page cache. + decrypted pages in the page cache. Currently Ext4, + F2FS and UBIFS make use of this feature. 
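(The transport.c changes above replace the bare integer credit count with a value/instance pair, so that credits obtained before a reconnect can be recognized as stale and returned rather than spent on the new session. A minimal self-contained sketch of that pattern follows; the struct and helper names here are stand-ins, not the real struct cifs_credits and TCP_Server_Info fields from fs/cifs/cifsglob.h:)

	#include <errno.h>

	struct credits {
		unsigned int value;     /* credits held for this request */
		unsigned int instance;  /* reconnect generation they came from */
	};

	struct server {
		unsigned int reconnect_instance; /* bumped on every reconnect */
		unsigned int avail;              /* free credits in the pool */
	};

	/* Return credits to the pool (simplified add_credits()). */
	static void give_back(struct server *srv, struct credits *cr)
	{
		srv->avail += cr->value;
		cr->value = 0;
	}

	/* Before transmitting, verify the credits still belong to the live
	 * session; this mirrors the instance != reconnect_instance checks
	 * added in cifs_call_async() and compound_send_recv() above.
	 */
	static int send_checked(struct server *srv, struct credits *cr)
	{
		if (cr->instance != srv->reconnect_instance) {
			give_back(srv, cr);
			return -EAGAIN; /* caller retries on the new session */
		}
		return 0; /* ... actual send would go here ... */
	}
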
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 7424f851eb5c..7da276159593 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -12,7 +12,6 @@ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H -#define __FS_HAS_ENCRYPTION 1 #include <linux/fscrypt.h> #include <crypto/hash.h> diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 926e5df20ec3..56debb1fcf5e 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -58,7 +58,7 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir) return err; if (!fscrypt_has_permitted_context(dir, inode)) - return -EPERM; + return -EXDEV; return 0; } @@ -82,13 +82,13 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_ENCRYPTED(new_dir) && !fscrypt_has_permitted_context(new_dir, d_inode(old_dentry))) - return -EPERM; + return -EXDEV; if ((flags & RENAME_EXCHANGE) && IS_ENCRYPTED(old_dir) && !fscrypt_has_permitted_context(old_dir, d_inode(new_dentry))) - return -EPERM; + return -EXDEV; } return 0; } diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 322ce9686bdb..2cb4956f8511 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -402,7 +402,6 @@ static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) { SHASH_DESC_ON_STACK(desc, tfm); desc->tfm = tfm; - desc->flags = 0; return crypto_shash_digest(desc, key, keysize, salt); } diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index f490de921ce8..bd7eaf9b3f00 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -151,8 +151,7 @@ EXPORT_SYMBOL(fscrypt_ioctl_get_policy); * malicious offline violations of this constraint, while the link and rename * checks are needed to prevent online violations of this constraint. * - * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail - * the filesystem operation with EPERM. + * Return: 1 if permitted, 0 if forbidden. */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { @@ -33,6 +33,7 @@ #include <linux/sizes.h> #include <linux/mmu_notifier.h> #include <linux/iomap.h> +#include <asm/pgalloc.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -788,7 +789,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, address = pgoff_address(index, vma); /* - * Note because we provide start/end to follow_pte_pmd it will + * Note because we provide range to follow_pte_pmd it will * call mmu_notifier_invalidate_range_start() on our behalf * before taking any lock. */ @@ -843,9 +844,8 @@ unlock_pte: static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, struct address_space *mapping, void *entry) { - unsigned long pfn; + unsigned long pfn, index, count; long ret = 0; - size_t size; /* * A page got tagged dirty in DAX mapping? Something is seriously @@ -894,17 +894,18 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, xas_unlock_irq(xas); /* - * Even if dax_writeback_mapping_range() was given a wbc->range_start - * in the middle of a PMD, the 'index' we are given will be aligned to - * the start index of the PMD, as will the pfn we pull from 'entry'. + * If dax_writeback_mapping_range() was given a wbc->range_start + * in the middle of a PMD, the 'index' we use needs to be + * aligned to the start of the PMD. * This allows us to flush for PMD_SIZE and not have to worry about * partial PMD writebacks. 
*/ pfn = dax_to_pfn(entry); - size = PAGE_SIZE << dax_entry_order(entry); + count = 1UL << dax_entry_order(entry); + index = xas->xa_index & ~(count - 1); - dax_entry_mkclean(mapping, xas->xa_index, pfn); - dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); + dax_entry_mkclean(mapping, index, pfn); + dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as @@ -917,8 +918,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); dax_wake_entry(xas, entry, false); - trace_dax_writeback_one(mapping->host, xas->xa_index, - size >> PAGE_SHIFT); + trace_dax_writeback_one(mapping->host, index, count); return ret; put_unlocked: @@ -1220,9 +1220,7 @@ static vm_fault_t dax_fault_return(int error) { if (error == 0) return VM_FAULT_NOPAGE; - if (error == -ENOMEM) - return VM_FAULT_OOM; - return VM_FAULT_SIGBUS; + return vmf_error(error); } /* @@ -1410,7 +1408,9 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; + struct vm_area_struct *vma = vmf->vma; struct inode *inode = mapping->host; + pgtable_t pgtable = NULL; struct page *zero_page; spinlock_t *ptl; pmd_t pmd_entry; @@ -1425,12 +1425,22 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, DAX_PMD | DAX_ZERO_PAGE, false); + if (arch_needs_pgtable_deposit()) { + pgtable = pte_alloc_one(vma->vm_mm); + if (!pgtable) + return VM_FAULT_OOM; + } + ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { spin_unlock(ptl); goto fallback; } + if (pgtable) { + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + mm_inc_nr_ptes(vma->vm_mm); + } pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); pmd_entry = pmd_mkhuge(pmd_entry); set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); @@ -1439,6 +1449,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, return VM_FAULT_NOPAGE; fallback: + if (pgtable) + pte_free(vma->vm_mm, pgtable); trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); return VM_FAULT_FALLBACK; } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 95b5e78c22b1..f25daa207421 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -163,19 +163,24 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void debugfs_evict_inode(struct inode *inode) +static void debugfs_i_callback(struct rcu_head *head) { - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); + struct inode *inode = container_of(head, struct inode, i_rcu); if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); + free_inode_nonrcu(inode); +} + +static void debugfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, debugfs_i_callback); } static const struct super_operations debugfs_super_operations = { .statfs = simple_statfs, .remount_fs = debugfs_remount, .show_options = debugfs_show_options, - .evict_inode = debugfs_evict_inode, + .destroy_inode = debugfs_destroy_inode, }; static void debugfs_release_dentry(struct dentry *dentry) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c53814539070..553a3f3300ae 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -455,6 
+455,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_blocksize_bits = 10; s->s_magic = DEVPTS_SUPER_MAGIC; s->s_op = &devpts_sops; + s->s_d_op = &simple_dentry_operations; s->s_time_gran = 1; error = -ENOMEM; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index f664da55234e..491cf5baa8c2 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -68,7 +68,6 @@ static int ecryptfs_hash_digest(struct crypto_shash *tfm, int err; desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_digest(desc, src, len, dst); shash_desc_zero(desc); return err; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index e74fe84d0886..90fbac5d485b 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -769,7 +769,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, } s->hash_desc->tfm = s->hash_tfm; - s->hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; rc = crypto_shash_digest(s->hash_desc, (u8 *)s->auth_tok->token.password.session_key_encryption_key, diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS deleted file mode 100644 index 1b2d4c63a579..000000000000 --- a/fs/exofs/BUGS +++ /dev/null @@ -1,3 +0,0 @@ -- Out-of-space may cause a severe problem if the object (and directory entry) - were written, but the inode attributes failed. Then if the filesystem was - unmounted and mounted the kernel can get into an endless loop doing a readdir. diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild deleted file mode 100644 index a364fd0965ec..000000000000 --- a/fs/exofs/Kbuild +++ /dev/null @@ -1,20 +0,0 @@ -# -# Kbuild for the EXOFS module -# -# Copyright (C) 2008 Panasas Inc. All rights reserved. -# -# Authors: -# Boaz Harrosh <ooo@electrozaur.com> -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 -# -# Kbuild - Gets included from the Kernels Makefile and build system -# - -# ore module library -libore-y := ore.o ore_raid.o -obj-$(CONFIG_ORE) += libore.o - -exofs-y := inode.o file.o namei.o dir.o super.o sys.o -obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig deleted file mode 100644 index 86194b2f799d..000000000000 --- a/fs/exofs/Kconfig +++ /dev/null @@ -1,13 +0,0 @@ -config EXOFS_FS - tristate "exofs: OSD based file system support" - depends on SCSI_OSD_ULD - help - EXOFS is a file system that uses an OSD storage device, - as its backing storage. - -# Debugging-related stuff -config EXOFS_DEBUG - bool "Enable debugging" - depends on EXOFS_FS - help - This option enables EXOFS debug prints. diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore deleted file mode 100644 index 2daf2329c28d..000000000000 --- a/fs/exofs/Kconfig.ore +++ /dev/null @@ -1,14 +0,0 @@ -# ORE - Objects Raid Engine (libore.ko) -# -# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects -# for every ORE user we do it like this. Any user should add itself here -# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are -# selected here, and we default to "ON". So in effect it is like been -# selected by any of the users. 
-config ORE - tristate - depends on EXOFS_FS || PNFS_OBJLAYOUT - select ASYNC_XOR - select RAID6_PQ - select ASYNC_PQ - default SCSI_OSD_ULD diff --git a/fs/exofs/common.h b/fs/exofs/common.h deleted file mode 100644 index 7d88ef566213..000000000000 --- a/fs/exofs/common.h +++ /dev/null @@ -1,262 +0,0 @@ -/* - * common.h - Common definitions for both Kernel and user-mode utilities - * - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef __EXOFS_COM_H__ -#define __EXOFS_COM_H__ - -#include <linux/types.h> - -#include <scsi/osd_attributes.h> -#include <scsi/osd_initiator.h> -#include <scsi/osd_sec.h> - -/**************************************************************************** - * Object ID related defines - * NOTE: inode# = object ID - EXOFS_OBJ_OFF - ****************************************************************************/ -#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */ -#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */ -#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */ -#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */ -#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ - -/* exofs Application specific page/attribute */ -/* Inode attrs */ -# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) -# define EXOFS_ATTR_INODE_DATA 1 -# define EXOFS_ATTR_INODE_FILE_LAYOUT 2 -# define EXOFS_ATTR_INODE_DIR_LAYOUT 3 -/* Partition attrs */ -# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3) -# define EXOFS_ATTR_SB_STATS 1 - -/* - * The maximum number of files we can have is limited by the size of the - * inode number. This is the largest object ID that the file system supports. - * Object IDs 0, 1, and 2 are always in use (see above defines). - */ -enum { - EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX : - (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)), - EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF), -}; - -/**************************************************************************** - * Misc. 
- ****************************************************************************/ -#define EXOFS_BLKSHIFT 12 -#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT) - -/**************************************************************************** - * superblock-related things - ****************************************************************************/ -#define EXOFS_SUPER_MAGIC 0x5DF5 - -/* - * The file system control block - stored in object EXOFS_SUPER_ID's data. - * This is where the in-memory superblock is stored on disk. - */ -enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; -struct exofs_fscb { - __le64 s_nextid; /* Only used after mkfs */ - __le64 s_numfiles; /* Only used after mkfs */ - __le32 s_version; /* == EXOFS_FSCB_VER */ - __le16 s_magic; /* Magic signature */ - __le16 s_newfs; /* Non-zero if this is a new fs */ - - /* From here on it's a static part, only written by mkexofs */ - __le64 s_dev_table_oid; /* Resurved, not used */ - __le64 s_dev_table_count; /* == 0 means no dev_table */ -} __packed; - -/* - * This struct is set on the FS partition's attributes. - * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together - * with the create command, to atomically persist the sb writeable information. - */ -struct exofs_sb_stats { - __le64 s_nextid; /* Highest object ID used */ - __le64 s_numfiles; /* Number of files on fs */ -} __packed; - -/* - * Describes the raid used in the FS. It is part of the device table. - * This here is taken from the pNFS-objects definition. In exofs we - * use one raid policy through-out the filesystem. (NOTE: the funny - * alignment at beginning. We take care of it at exofs_device_table. - */ -struct exofs_dt_data_map { - __le32 cb_num_comps; - __le64 cb_stripe_unit; - __le32 cb_group_width; - __le32 cb_group_depth; - __le32 cb_mirror_cnt; - __le32 cb_raid_algorithm; -} __packed; - -/* - * This is an osd device information descriptor. It is a single entry in - * the exofs device table. It describes an osd target lun which - * contains data belonging to this FS. (Same partition_id on all devices) - */ -struct exofs_dt_device_info { - __le32 systemid_len; - u8 systemid[OSD_SYSTEMID_LEN]; - __le64 long_name_offset; /* If !0 then offset-in-file */ - __le32 osdname_len; /* */ - u8 osdname[44]; /* Embbeded, Usually an asci uuid */ -} __packed; - -/* - * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data. - * It contains the raid used for this multy-device FS and an array of - * participating devices. - */ -struct exofs_device_table { - __le32 dt_version; /* == EXOFS_DT_VER */ - struct exofs_dt_data_map dt_data_map; /* Raid policy to use */ - - /* Resurved space For future use. Total includeing this: - * (8 * sizeof(le64)) - */ - __le64 __Resurved[4]; - - __le64 dt_num_devices; /* Array size */ - struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */ -} __packed; - -/**************************************************************************** - * inode-related things - ****************************************************************************/ -#define EXOFS_IDATA 5 - -/* - * The file control block - stored in an object's attributes. This is where - * the in-memory inode is stored on disk. 
- */ -struct exofs_fcb { - __le64 i_size; /* Size of the file */ - __le16 i_mode; /* File mode */ - __le16 i_links_count; /* Links count */ - __le32 i_uid; /* Owner Uid */ - __le32 i_gid; /* Group Id */ - __le32 i_atime; /* Access time */ - __le32 i_ctime; /* Creation time */ - __le32 i_mtime; /* Modification time */ - __le32 i_flags; /* File flags (unused for now)*/ - __le32 i_generation; /* File version (for NFS) */ - __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */ -}; - -#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb) - -/* This is the Attribute the fcb is stored in */ -static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF( - EXOFS_APAGE_FS_DATA, - EXOFS_ATTR_INODE_DATA, - EXOFS_INO_ATTR_SIZE); - -/**************************************************************************** - * dentry-related things - ****************************************************************************/ -#define EXOFS_NAME_LEN 255 - -/* - * The on-disk directory entry - */ -struct exofs_dir_entry { - __le64 inode_no; /* inode number */ - __le16 rec_len; /* directory entry length */ - u8 name_len; /* name length */ - u8 file_type; /* umm...file type */ - char name[EXOFS_NAME_LEN]; /* file name */ -}; - -enum { - EXOFS_FT_UNKNOWN, - EXOFS_FT_REG_FILE, - EXOFS_FT_DIR, - EXOFS_FT_CHRDEV, - EXOFS_FT_BLKDEV, - EXOFS_FT_FIFO, - EXOFS_FT_SOCK, - EXOFS_FT_SYMLINK, - EXOFS_FT_MAX -}; - -#define EXOFS_DIR_PAD 4 -#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1) -#define EXOFS_DIR_REC_LEN(name_len) \ - (((name_len) + offsetof(struct exofs_dir_entry, name) + \ - EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) - -/* - * The on-disk (optional) layout structure. - * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT - * attribute, attached to any inode, usually to a directory. - */ - -enum exofs_inode_layout_gen_functions { - LAYOUT_MOVING_WINDOW = 0, - LAYOUT_IMPLICT = 1, -}; - -struct exofs_on_disk_inode_layout { - __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */ - __le16 pad; - union { - /* gen_func == LAYOUT_MOVING_WINDOW (default) */ - struct exofs_layout_sliding_window { - __le32 num_devices; /* first n devices in global-table*/ - } sliding_window __packed; - - /* gen_func == LAYOUT_IMPLICT */ - struct exofs_layout_implict_list { - struct exofs_dt_data_map data_map; - /* Variable array of size data_map.cb_num_comps. These - * are device indexes of the devices in the global table - */ - __le32 dev_indexes[]; - } implict __packed; - }; -} __packed; - -static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs) -{ - return sizeof(struct exofs_on_disk_inode_layout) + - max_devs * sizeof(__le32); -} - -#endif /*ifndef __EXOFS_COM_H__*/ diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c deleted file mode 100644 index f0138674c1ed..000000000000 --- a/fs/exofs/dir.c +++ /dev/null @@ -1,661 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. 
Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <linux/iversion.h> -#include "exofs.h" - -static inline unsigned exofs_chunk_size(struct inode *inode) -{ - return inode->i_sb->s_blocksize; -} - -static inline void exofs_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - -static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr) -{ - loff_t last_byte = inode->i_size; - - last_byte -= page_nr << PAGE_SHIFT; - if (last_byte > PAGE_SIZE) - last_byte = PAGE_SIZE; - return last_byte; -} - -static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len) -{ - struct address_space *mapping = page->mapping; - struct inode *dir = mapping->host; - int err = 0; - - inode_inc_iversion(dir); - - if (!PageUptodate(page)) - SetPageUptodate(page); - - if (pos+len > dir->i_size) { - i_size_write(dir, pos+len); - mark_inode_dirty(dir); - } - set_page_dirty(page); - - if (IS_DIRSYNC(dir)) - err = write_one_page(page); - else - unlock_page(page); - - return err; -} - -static bool exofs_check_page(struct page *page) -{ - struct inode *dir = page->mapping->host; - unsigned chunk_size = exofs_chunk_size(dir); - char *kaddr = page_address(page); - unsigned offs, rec_len; - unsigned limit = PAGE_SIZE; - struct exofs_dir_entry *p; - char *error; - - /* if the page is the last one in the directory */ - if ((dir->i_size >> PAGE_SHIFT) == page->index) { - limit = dir->i_size & ~PAGE_MASK; - if (limit & (chunk_size - 1)) - goto Ebadsize; - if (!limit) - goto out; - } - for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) { - p = (struct exofs_dir_entry *)(kaddr + offs); - rec_len = le16_to_cpu(p->rec_len); - - if (rec_len < EXOFS_DIR_REC_LEN(1)) - goto Eshort; - if (rec_len & 3) - goto Ealign; - if (rec_len < EXOFS_DIR_REC_LEN(p->name_len)) - goto Enamelen; - if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) - goto Espan; - } - if (offs != limit) - goto Eend; -out: - SetPageChecked(page); - return true; - -Ebadsize: - EXOFS_ERR("ERROR [exofs_check_page]: " - "size of directory(0x%lx) is not a multiple of chunk size\n", - dir->i_ino - ); - goto fail; -Eshort: - error = "rec_len is smaller than minimal"; - goto bad_entry; -Ealign: - error = "unaligned directory entry"; - goto bad_entry; -Enamelen: - error = "rec_len is too small for name_len"; - goto bad_entry; -Espan: - error = "directory entry across blocks"; - goto bad_entry; -bad_entry: - EXOFS_ERR( - "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - " - "offset=%lu, inode=0x%llx, rec_len=%d, name_len=%d\n", - dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs, - _LLU(le64_to_cpu(p->inode_no)), - rec_len, p->name_len); - goto fail; -Eend: - p = (struct exofs_dir_entry *)(kaddr + offs); - EXOFS_ERR("ERROR [exofs_check_page]: " - "entry in directory(0x%lx) spans the page boundary" - "offset=%lu, inode=0x%llx\n", - dir->i_ino, (page->index<<PAGE_SHIFT)+offs, - _LLU(le64_to_cpu(p->inode_no))); -fail: - SetPageError(page); - 
return false; -} - -static struct page *exofs_get_page(struct inode *dir, unsigned long n) -{ - struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - - if (!IS_ERR(page)) { - kmap(page); - if (unlikely(!PageChecked(page))) { - if (PageError(page) || !exofs_check_page(page)) - goto fail; - } - } - return page; - -fail: - exofs_put_page(page); - return ERR_PTR(-EIO); -} - -static inline int exofs_match(int len, const unsigned char *name, - struct exofs_dir_entry *de) -{ - if (len != de->name_len) - return 0; - if (!de->inode_no) - return 0; - return !memcmp(name, de->name, len); -} - -static inline -struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p) -{ - return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); -} - -static inline unsigned -exofs_validate_entry(char *base, unsigned offset, unsigned mask) -{ - struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset); - struct exofs_dir_entry *p = - (struct exofs_dir_entry *)(base + (offset&mask)); - while ((char *)p < (char *)de) { - if (p->rec_len == 0) - break; - p = exofs_next_entry(p); - } - return (char *)p - base; -} - -static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = { - [EXOFS_FT_UNKNOWN] = DT_UNKNOWN, - [EXOFS_FT_REG_FILE] = DT_REG, - [EXOFS_FT_DIR] = DT_DIR, - [EXOFS_FT_CHRDEV] = DT_CHR, - [EXOFS_FT_BLKDEV] = DT_BLK, - [EXOFS_FT_FIFO] = DT_FIFO, - [EXOFS_FT_SOCK] = DT_SOCK, - [EXOFS_FT_SYMLINK] = DT_LNK, -}; - -#define S_SHIFT 12 -static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR, - [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK, - [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK, -}; - -static inline -void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) -{ - umode_t mode = inode->i_mode; - de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; -} - -static int -exofs_readdir(struct file *file, struct dir_context *ctx) -{ - loff_t pos = ctx->pos; - struct inode *inode = file_inode(file); - unsigned int offset = pos & ~PAGE_MASK; - unsigned long n = pos >> PAGE_SHIFT; - unsigned long npages = dir_pages(inode); - unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); - bool need_revalidate = !inode_eq_iversion(inode, file->f_version); - - if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) - return 0; - - for ( ; n < npages; n++, offset = 0) { - char *kaddr, *limit; - struct exofs_dir_entry *de; - struct page *page = exofs_get_page(inode, n); - - if (IS_ERR(page)) { - EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n", - inode->i_ino); - ctx->pos += PAGE_SIZE - offset; - return PTR_ERR(page); - } - kaddr = page_address(page); - if (unlikely(need_revalidate)) { - if (offset) { - offset = exofs_validate_entry(kaddr, offset, - chunk_mask); - ctx->pos = (n<<PAGE_SHIFT) + offset; - } - file->f_version = inode_query_iversion(inode); - need_revalidate = false; - } - de = (struct exofs_dir_entry *)(kaddr + offset); - limit = kaddr + exofs_last_byte(inode, n) - - EXOFS_DIR_REC_LEN(1); - for (; (char *)de <= limit; de = exofs_next_entry(de)) { - if (de->rec_len == 0) { - EXOFS_ERR("ERROR: " - "zero-length entry in directory(0x%lx)\n", - inode->i_ino); - exofs_put_page(page); - return -EIO; - } - if (de->inode_no) { - unsigned char t; - - if (de->file_type < EXOFS_FT_MAX) - t = exofs_filetype_table[de->file_type]; - else 
- t = DT_UNKNOWN; - - if (!dir_emit(ctx, de->name, de->name_len, - le64_to_cpu(de->inode_no), - t)) { - exofs_put_page(page); - return 0; - } - } - ctx->pos += le16_to_cpu(de->rec_len); - } - exofs_put_page(page); - } - return 0; -} - -struct exofs_dir_entry *exofs_find_entry(struct inode *dir, - struct dentry *dentry, struct page **res_page) -{ - const unsigned char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - unsigned reclen = EXOFS_DIR_REC_LEN(namelen); - unsigned long start, n; - unsigned long npages = dir_pages(dir); - struct page *page = NULL; - struct exofs_i_info *oi = exofs_i(dir); - struct exofs_dir_entry *de; - - if (npages == 0) - goto out; - - *res_page = NULL; - - start = oi->i_dir_start_lookup; - if (start >= npages) - start = 0; - n = start; - do { - char *kaddr; - page = exofs_get_page(dir, n); - if (!IS_ERR(page)) { - kaddr = page_address(page); - de = (struct exofs_dir_entry *) kaddr; - kaddr += exofs_last_byte(dir, n) - reclen; - while ((char *) de <= kaddr) { - if (de->rec_len == 0) { - EXOFS_ERR("ERROR: zero-length entry in " - "directory(0x%lx)\n", - dir->i_ino); - exofs_put_page(page); - goto out; - } - if (exofs_match(namelen, name, de)) - goto found; - de = exofs_next_entry(de); - } - exofs_put_page(page); - } - if (++n >= npages) - n = 0; - } while (n != start); -out: - return NULL; - -found: - *res_page = page; - oi->i_dir_start_lookup = n; - return de; -} - -struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p) -{ - struct page *page = exofs_get_page(dir, 0); - struct exofs_dir_entry *de = NULL; - - if (!IS_ERR(page)) { - de = exofs_next_entry( - (struct exofs_dir_entry *)page_address(page)); - *p = page; - } - return de; -} - -ino_t exofs_parent_ino(struct dentry *child) -{ - struct page *page; - struct exofs_dir_entry *de; - ino_t ino; - - de = exofs_dotdot(d_inode(child), &page); - if (!de) - return 0; - - ino = le64_to_cpu(de->inode_no); - exofs_put_page(page); - return ino; -} - -ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry) -{ - ino_t res = 0; - struct exofs_dir_entry *de; - struct page *page; - - de = exofs_find_entry(dir, dentry, &page); - if (de) { - res = le64_to_cpu(de->inode_no); - exofs_put_page(page); - } - return res; -} - -int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, - struct page *page, struct inode *inode) -{ - loff_t pos = page_offset(page) + - (char *) de - (char *) page_address(page); - unsigned len = le16_to_cpu(de->rec_len); - int err; - - lock_page(page); - err = exofs_write_begin(NULL, page->mapping, pos, len, 0, &page, NULL); - if (err) - EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n", - err); - - de->inode_no = cpu_to_le64(inode->i_ino); - exofs_set_de_type(de, inode); - if (likely(!err)) - err = exofs_commit_chunk(page, pos, len); - exofs_put_page(page); - dir->i_mtime = dir->i_ctime = current_time(dir); - mark_inode_dirty(dir); - return err; -} - -int exofs_add_link(struct dentry *dentry, struct inode *inode) -{ - struct inode *dir = d_inode(dentry->d_parent); - const unsigned char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - unsigned chunk_size = exofs_chunk_size(dir); - unsigned reclen = EXOFS_DIR_REC_LEN(namelen); - unsigned short rec_len, name_len; - struct page *page = NULL; - struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; - struct exofs_dir_entry *de; - unsigned long npages = dir_pages(dir); - unsigned long n; - char *kaddr; - loff_t pos; - int err; - - for (n = 0; n <= npages; n++) { - char *dir_end; - 
- page = exofs_get_page(dir, n); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto out; - lock_page(page); - kaddr = page_address(page); - dir_end = kaddr + exofs_last_byte(dir, n); - de = (struct exofs_dir_entry *)kaddr; - kaddr += PAGE_SIZE - reclen; - while ((char *)de <= kaddr) { - if ((char *)de == dir_end) { - name_len = 0; - rec_len = chunk_size; - de->rec_len = cpu_to_le16(chunk_size); - de->inode_no = 0; - goto got_it; - } - if (de->rec_len == 0) { - EXOFS_ERR("ERROR: exofs_add_link: " - "zero-length entry in directory(0x%lx)\n", - inode->i_ino); - err = -EIO; - goto out_unlock; - } - err = -EEXIST; - if (exofs_match(namelen, name, de)) - goto out_unlock; - name_len = EXOFS_DIR_REC_LEN(de->name_len); - rec_len = le16_to_cpu(de->rec_len); - if (!de->inode_no && rec_len >= reclen) - goto got_it; - if (rec_len >= name_len + reclen) - goto got_it; - de = (struct exofs_dir_entry *) ((char *) de + rec_len); - } - unlock_page(page); - exofs_put_page(page); - } - - EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n", - dentry, inode->i_ino); - return -EINVAL; - -got_it: - pos = page_offset(page) + - (char *)de - (char *)page_address(page); - err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0, - &page, NULL); - if (err) - goto out_unlock; - if (de->inode_no) { - struct exofs_dir_entry *de1 = - (struct exofs_dir_entry *)((char *)de + name_len); - de1->rec_len = cpu_to_le16(rec_len - name_len); - de->rec_len = cpu_to_le16(name_len); - de = de1; - } - de->name_len = namelen; - memcpy(de->name, name, namelen); - de->inode_no = cpu_to_le64(inode->i_ino); - exofs_set_de_type(de, inode); - err = exofs_commit_chunk(page, pos, rec_len); - dir->i_mtime = dir->i_ctime = current_time(dir); - mark_inode_dirty(dir); - sbi->s_numfiles++; - -out_put: - exofs_put_page(page); -out: - return err; -out_unlock: - unlock_page(page); - goto out_put; -} - -int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page) -{ - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; - char *kaddr = page_address(page); - unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1); - unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); - loff_t pos; - struct exofs_dir_entry *pde = NULL; - struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from); - int err; - - while (de < dir) { - if (de->rec_len == 0) { - EXOFS_ERR("ERROR: exofs_delete_entry:" - "zero-length entry in directory(0x%lx)\n", - inode->i_ino); - err = -EIO; - goto out; - } - pde = de; - de = exofs_next_entry(de); - } - if (pde) - from = (char *)pde - (char *)page_address(page); - pos = page_offset(page) + from; - lock_page(page); - err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0, - &page, NULL); - if (err) - EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n", - err); - if (pde) - pde->rec_len = cpu_to_le16(to - from); - dir->inode_no = 0; - if (likely(!err)) - err = exofs_commit_chunk(page, pos, to - from); - inode->i_ctime = inode->i_mtime = current_time(inode); - mark_inode_dirty(inode); - sbi->s_numfiles--; -out: - exofs_put_page(page); - return err; -} - -/* kept aligned on 4 bytes */ -#define THIS_DIR ".\0\0" -#define PARENT_DIR "..\0" - -int exofs_make_empty(struct inode *inode, struct inode *parent) -{ - struct address_space *mapping = inode->i_mapping; - struct page *page = grab_cache_page(mapping, 0); - unsigned chunk_size = exofs_chunk_size(inode); - struct exofs_dir_entry *de; 
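An aside on the record format the two functions above manipulate: exofs directories use ext2-style variable-length records, so exofs_add_link() either claims a dead record (!de->inode_no && rec_len >= reclen) or splits a live record whose rounding slack can absorb the new name, and exofs_delete_entry() reclaims space by widening the predecessor's rec_len across the victim. The split arithmetic as a self-contained userspace sketch; the field layout and 4-byte rounding are assumptions modelled on ext2 (fs/exofs/common.h, which defines the real EXOFS_DIR_REC_LEN, is outside this hunk), and endianness conversions are dropped:

    #include <stdint.h>
    #include <stddef.h>

    /* Local model of the on-disk record; the field order is an assumption. */
    struct dir_rec {
        uint64_t inode_no;
        uint16_t rec_len;    /* distance to the next record */
        uint8_t  name_len;
        uint8_t  file_type;
        char     name[];
    };

    #define DIR_PAD   4
    #define DIR_ROUND (DIR_PAD - 1)
    #define DIR_REC_LEN(nlen) \
        ((offsetof(struct dir_rec, name) + (nlen) + DIR_ROUND) & ~DIR_ROUND)

    /* The "got_it:" split above: shrink the live record to its own aligned
     * length and hand the remaining slack to a fresh record for the new name. */
    static struct dir_rec *split_rec(struct dir_rec *de)
    {
        uint16_t used = DIR_REC_LEN(de->name_len);
        struct dir_rec *de1 = (struct dir_rec *)((char *)de + used);

        de1->rec_len = de->rec_len - used;
        de->rec_len  = used;
        return de1;    /* caller fills in the new name, length and type */
    }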
- int err; - void *kaddr; - - if (!page) - return -ENOMEM; - - err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0, - &page, NULL); - if (err) { - unlock_page(page); - goto fail; - } - - kaddr = kmap_atomic(page); - de = (struct exofs_dir_entry *)kaddr; - de->name_len = 1; - de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1)); - memcpy(de->name, THIS_DIR, sizeof(THIS_DIR)); - de->inode_no = cpu_to_le64(inode->i_ino); - exofs_set_de_type(de, inode); - - de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1)); - de->name_len = 2; - de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1)); - de->inode_no = cpu_to_le64(parent->i_ino); - memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); - exofs_set_de_type(de, inode); - kunmap_atomic(kaddr); - err = exofs_commit_chunk(page, 0, chunk_size); -fail: - put_page(page); - return err; -} - -int exofs_empty_dir(struct inode *inode) -{ - struct page *page = NULL; - unsigned long i, npages = dir_pages(inode); - - for (i = 0; i < npages; i++) { - char *kaddr; - struct exofs_dir_entry *de; - page = exofs_get_page(inode, i); - - if (IS_ERR(page)) - continue; - - kaddr = page_address(page); - de = (struct exofs_dir_entry *)kaddr; - kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1); - - while ((char *)de <= kaddr) { - if (de->rec_len == 0) { - EXOFS_ERR("ERROR: exofs_empty_dir: " - "zero-length directory entry" - "kaddr=%p, de=%p\n", kaddr, de); - goto not_empty; - } - if (de->inode_no != 0) { - /* check for . and .. */ - if (de->name[0] != '.') - goto not_empty; - if (de->name_len > 2) - goto not_empty; - if (de->name_len < 2) { - if (le64_to_cpu(de->inode_no) != - inode->i_ino) - goto not_empty; - } else if (de->name[1] != '.') - goto not_empty; - } - de = exofs_next_entry(de); - } - exofs_put_page(page); - } - return 1; - -not_empty: - exofs_put_page(page); - return 0; -} - -const struct file_operations exofs_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .iterate_shared = exofs_readdir, -}; diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h deleted file mode 100644 index 5dc392404559..000000000000 --- a/fs/exofs/exofs.h +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __EXOFS_H__ -#define __EXOFS_H__ - -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/backing-dev.h> -#include <scsi/osd_ore.h> - -#include "common.h" - -#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) - -#ifdef CONFIG_EXOFS_DEBUG -#define EXOFS_DBGMSG(fmt, a...) \ - printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a) -#else -#define EXOFS_DBGMSG(fmt, a...) \ - do { if (0) printk(fmt, ##a); } while (0) -#endif - -/* u64 has problems with printk this will cast it to unsigned long long */ -#define _LLU(x) (unsigned long long)(x) - -struct exofs_dev { - struct ore_dev ored; - unsigned did; - unsigned urilen; - uint8_t *uri; - struct kobject ed_kobj; -}; -/* - * our extension to the in-memory superblock - */ -struct exofs_sb_info { - struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ - int s_timeout; /* timeout for OSD operations */ - uint64_t s_nextid; /* highest object ID used */ - uint32_t s_numfiles; /* number of files on fs */ - spinlock_t s_next_gen_lock; /* spinlock for gen # update */ - u32 s_next_generation; /* next gen # to use */ - atomic_t s_curr_pending; /* number of pending commands */ - - struct ore_layout layout; /* Default files layout */ - struct ore_comp one_comp; /* id & cred of partition id=0*/ - struct ore_components oc; /* comps for the partition */ - struct kobject s_kobj; /* holds per-sbi kobject */ -}; - -/* - * our extension to the in-memory inode - */ -struct exofs_i_info { - struct inode vfs_inode; /* normal in-memory inode */ - wait_queue_head_t i_wq; /* wait queue for inode */ - unsigned long i_flags; /* various atomic flags */ - uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ - uint32_t i_dir_start_lookup; /* which page to start lookup */ - uint64_t i_commit_size; /* the object's written length */ - struct ore_comp one_comp; /* same component for all devices */ - struct ore_components oc; /* inode view of the device table */ -}; - -static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) -{ - return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; -} - -/* - * our inode flags - */ -#define OBJ_2BCREATED 0 /* object will be created soon*/ -#define OBJ_CREATED 1 /* object has been created on the osd*/ - -static inline int obj_2bcreated(struct exofs_i_info *oi) -{ - return test_bit(OBJ_2BCREATED, &oi->i_flags); -} - -static inline void set_obj_2bcreated(struct exofs_i_info *oi) -{ - set_bit(OBJ_2BCREATED, &oi->i_flags); -} - -static inline int obj_created(struct exofs_i_info *oi) -{ - return test_bit(OBJ_CREATED, &oi->i_flags); -} - -static inline void set_obj_created(struct exofs_i_info *oi) -{ - set_bit(OBJ_CREATED, &oi->i_flags); -} - -int __exofs_wait_obj_created(struct exofs_i_info *oi); -static inline int wait_obj_created(struct exofs_i_info *oi) -{ - if (likely(obj_created(oi))) - return 0; - - return __exofs_wait_obj_created(oi); -} - -/* - * get to our inode from the vfs inode - */ -static inline struct exofs_i_info *exofs_i(struct inode *inode) -{ - return container_of(inode, struct exofs_i_info, vfs_inode); -} - -/* - * Maximum count of links to a file - */ -#define EXOFS_LINK_MAX 32000 - -/************************* - * function declarations * - *************************/ - -/* inode.c */ -unsigned exofs_max_io_pages(struct ore_layout *layout, - unsigned expected_pages); 
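The OBJ_2BCREATED/OBJ_CREATED bits and the i_wq wait queue declared above implement a small create handshake: exofs_new_inode() fires an asynchronous OSD create, and any path that needs the object on the target parks in wait_obj_created() until the completion callback publishes OBJ_CREATED and wakes the queue (both sides, create_done() and __exofs_wait_obj_created(), appear later in this diff in fs/exofs/inode.c). Condensed to its skeleton; the helper names here are mine and this is an illustration, not the original code:

    /* completion side, cf. create_done() */
    static void publish_created(struct exofs_i_info *oi)
    {
        set_bit(OBJ_CREATED, &oi->i_flags);    /* object now exists on the OSD */
        wake_up(&oi->i_wq);                    /* release any sleeper below    */
    }

    /* waiting side, cf. wait_obj_created()/__exofs_wait_obj_created() */
    static int wait_created(struct exofs_i_info *oi)
    {
        if (!test_bit(OBJ_CREATED, &oi->i_flags))
            wait_event(oi->i_wq, test_bit(OBJ_CREATED, &oi->i_flags));
        return is_bad_inode(&oi->vfs_inode) ? -EIO : 0;
    }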
-int exofs_setattr(struct dentry *, struct iattr *); -int exofs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata); -extern struct inode *exofs_iget(struct super_block *, unsigned long); -struct inode *exofs_new_inode(struct inode *, umode_t); -extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); -extern void exofs_evict_inode(struct inode *); - -/* dir.c: */ -int exofs_add_link(struct dentry *, struct inode *); -ino_t exofs_inode_by_name(struct inode *, struct dentry *); -int exofs_delete_entry(struct exofs_dir_entry *, struct page *); -int exofs_make_empty(struct inode *, struct inode *); -struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *, - struct page **); -int exofs_empty_dir(struct inode *); -struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **); -ino_t exofs_parent_ino(struct dentry *child); -int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *, - struct inode *); - -/* super.c */ -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], - const struct osd_obj_id *obj); -int exofs_sbi_write_stats(struct exofs_sb_info *sbi); - -/* sys.c */ -int exofs_sysfs_init(void); -void exofs_sysfs_uninit(void); -int exofs_sysfs_sb_add(struct exofs_sb_info *sbi, - struct exofs_dt_device_info *dt_dev); -void exofs_sysfs_sb_del(struct exofs_sb_info *sbi); -int exofs_sysfs_odev_add(struct exofs_dev *edev, - struct exofs_sb_info *sbi); -void exofs_sysfs_dbg_print(void); - -/********************* - * operation vectors * - *********************/ -/* dir.c: */ -extern const struct file_operations exofs_dir_operations; - -/* file.c */ -extern const struct inode_operations exofs_file_inode_operations; -extern const struct file_operations exofs_file_operations; - -/* inode.c */ -extern const struct address_space_operations exofs_aops; - -/* namei.c */ -extern const struct inode_operations exofs_dir_inode_operations; -extern const struct inode_operations exofs_special_inode_operations; - -/* exofs_init_comps will initialize an ore_components device array - * pointing to a single ore_comp struct, and a round-robin view - * of the device table. - * The first device of each inode is the [inode->ino % num_devices] - * and the rest of the devices sequentially following where the - * first device is after the last device. - * It is assumed that the global device array at @sbi is twice - * bigger and that the device table repeats twice. 
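A worked example of the round-robin computation this comment describes (the exofs_init_comps() helper follows just below); all numbers are hypothetical:

    /* mirrors_p1 = 2, sbi->oc.numdevs = 4, ods table stored twice:
     *   D0 D1 D2 D3 D0 D1 D2 D3
     * oid 10: first_dev = (10 * 2) % 4 = 0 -> window D0 D1 D2 D3
     * oid 11: first_dev = (11 * 2) % 4 = 2 -> window D2 D3 D0 D1
     * Multiplying by mirrors_p1 keeps every window aligned to a mirror set,
     * consecutive object ids start on different sets (spreading load), and
     * the duplicated tail lets a window run past D3 with no wraparound
     * arithmetic in the I/O path.
     */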
- * See: exofs_read_lookup_dev_table() - */ -static inline void exofs_init_comps(struct ore_components *oc, - struct ore_comp *one_comp, - struct exofs_sb_info *sbi, osd_id oid) -{ - unsigned dev_mod = (unsigned)oid, first_dev; - - one_comp->obj.partition = sbi->one_comp.obj.partition; - one_comp->obj.id = oid; - exofs_make_credential(one_comp->cred, &one_comp->obj); - - oc->first_dev = 0; - oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * - sbi->layout.group_count; - oc->single_comp = EC_SINGLE_COMP; - oc->comps = one_comp; - - /* Round robin device view of the table */ - first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; - oc->ods = &sbi->oc.ods[first_dev]; -} - -#endif diff --git a/fs/exofs/file.c b/fs/exofs/file.c deleted file mode 100644 index a94594ea2aa3..000000000000 --- a/fs/exofs/file.c +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "exofs.h" - -static int exofs_release_file(struct inode *inode, struct file *filp) -{ - return 0; -} - -/* exofs_file_fsync - flush the inode to disk - * - * Note, in exofs all metadata is written as part of inode, regardless. 
- * The writeout is synchronous - */ -static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *inode = filp->f_mapping->host; - int ret; - - ret = file_write_and_wait_range(filp, start, end); - if (ret) - return ret; - - inode_lock(inode); - ret = sync_inode_metadata(filp->f_mapping->host, 1); - inode_unlock(inode); - return ret; -} - -static int exofs_flush(struct file *file, fl_owner_t id) -{ - int ret = vfs_fsync(file, 0); - /* TODO: Flush the OSD target */ - return ret; -} - -const struct file_operations exofs_file_operations = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, - .open = generic_file_open, - .release = exofs_release_file, - .fsync = exofs_file_fsync, - .flush = exofs_flush, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, -}; - -const struct inode_operations exofs_file_inode_operations = { - .setattr = exofs_setattr, -}; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c deleted file mode 100644 index 5f81fcd383a4..000000000000 --- a/fs/exofs/inode.c +++ /dev/null @@ -1,1514 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <linux/slab.h> - -#include "exofs.h" - -#define EXOFS_DBGMSG2(M...) do {} while (0) - -unsigned exofs_max_io_pages(struct ore_layout *layout, - unsigned expected_pages) -{ - unsigned pages = min_t(unsigned, expected_pages, - layout->max_io_length / PAGE_SIZE); - - return pages; -} - -struct page_collect { - struct exofs_sb_info *sbi; - struct inode *inode; - unsigned expected_pages; - struct ore_io_state *ios; - - struct page **pages; - unsigned alloc_pages; - unsigned nr_pages; - unsigned long length; - loff_t pg_first; /* keep 64bit also in 32-arches */ - bool read_4_write; /* This means two things: that the read is sync - * And the pages should not be unlocked. 
- */ - struct page *that_locked_page; -}; - -static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, - struct inode *inode) -{ - struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; - - pcol->sbi = sbi; - pcol->inode = inode; - pcol->expected_pages = expected_pages; - - pcol->ios = NULL; - pcol->pages = NULL; - pcol->alloc_pages = 0; - pcol->nr_pages = 0; - pcol->length = 0; - pcol->pg_first = -1; - pcol->read_4_write = false; - pcol->that_locked_page = NULL; -} - -static void _pcol_reset(struct page_collect *pcol) -{ - pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); - - pcol->pages = NULL; - pcol->alloc_pages = 0; - pcol->nr_pages = 0; - pcol->length = 0; - pcol->pg_first = -1; - pcol->ios = NULL; - pcol->that_locked_page = NULL; - - /* this is probably the end of the loop but in writes - * it might not end here. don't be left with nothing - */ - if (!pcol->expected_pages) - pcol->expected_pages = - exofs_max_io_pages(&pcol->sbi->layout, ~0); -} - -static int pcol_try_alloc(struct page_collect *pcol) -{ - unsigned pages; - - /* TODO: easily support bio chaining */ - pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); - - for (; pages; pages >>= 1) { - pcol->pages = kmalloc_array(pages, sizeof(struct page *), - GFP_KERNEL); - if (likely(pcol->pages)) { - pcol->alloc_pages = pages; - return 0; - } - } - - EXOFS_ERR("Failed to kmalloc expected_pages=%u\n", - pcol->expected_pages); - return -ENOMEM; -} - -static void pcol_free(struct page_collect *pcol) -{ - kfree(pcol->pages); - pcol->pages = NULL; - - if (pcol->ios) { - ore_put_io_state(pcol->ios); - pcol->ios = NULL; - } -} - -static int pcol_add_page(struct page_collect *pcol, struct page *page, - unsigned len) -{ - if (unlikely(pcol->nr_pages >= pcol->alloc_pages)) - return -ENOMEM; - - pcol->pages[pcol->nr_pages++] = page; - pcol->length += len; - return 0; -} - -enum {PAGE_WAS_NOT_IN_IO = 17}; -static int update_read_page(struct page *page, int ret) -{ - switch (ret) { - case 0: - /* Everything is OK */ - SetPageUptodate(page); - if (PageError(page)) - ClearPageError(page); - break; - case -EFAULT: - /* In this case we were trying to read something that wasn't on - * disk yet - return a page full of zeroes. This should be OK, - * because the object should be empty (if there was a write - * before this read, the read would be waiting with the page - * locked */ - clear_highpage(page); - - SetPageUptodate(page); - if (PageError(page)) - ClearPageError(page); - EXOFS_DBGMSG("recovered read error\n"); - /* fall through */ - case PAGE_WAS_NOT_IN_IO: - ret = 0; /* recovered error */ - break; - default: - SetPageError(page); - } - return ret; -} - -static void update_write_page(struct page *page, int ret) -{ - if (unlikely(ret == PAGE_WAS_NOT_IN_IO)) - return; /* don't pass start don't collect $200 */ - - if (ret) { - mapping_set_error(page->mapping, ret); - SetPageError(page); - } - end_page_writeback(page); -} - -/* Called at the end of reads, to optionally unlock pages and update their - * status. 
- */ -static int __readpages_done(struct page_collect *pcol) -{ - int i; - u64 good_bytes; - u64 length = 0; - int ret = ore_check_io(pcol->ios, NULL); - - if (likely(!ret)) { - good_bytes = pcol->length; - ret = PAGE_WAS_NOT_IN_IO; - } else { - good_bytes = 0; - } - - EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" - " length=0x%lx nr_pages=%u\n", - pcol->inode->i_ino, _LLU(good_bytes), pcol->length, - pcol->nr_pages); - - for (i = 0; i < pcol->nr_pages; i++) { - struct page *page = pcol->pages[i]; - struct inode *inode = page->mapping->host; - int page_stat; - - if (inode != pcol->inode) - continue; /* osd might add more pages at end */ - - if (likely(length < good_bytes)) - page_stat = 0; - else - page_stat = ret; - - EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n", - inode->i_ino, page->index, - page_stat ? "bad_bytes" : "good_bytes"); - - ret = update_read_page(page, page_stat); - if (!pcol->read_4_write) - unlock_page(page); - length += PAGE_SIZE; - } - - pcol_free(pcol); - EXOFS_DBGMSG2("readpages_done END\n"); - return ret; -} - -/* callback of async reads */ -static void readpages_done(struct ore_io_state *ios, void *p) -{ - struct page_collect *pcol = p; - - __readpages_done(pcol); - atomic_dec(&pcol->sbi->s_curr_pending); - kfree(pcol); -} - -static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) -{ - int i; - - for (i = 0; i < pcol->nr_pages; i++) { - struct page *page = pcol->pages[i]; - - if (rw == READ) - update_read_page(page, ret); - else - update_write_page(page, ret); - - unlock_page(page); - } -} - -static int _maybe_not_all_in_one_io(struct ore_io_state *ios, - struct page_collect *pcol_src, struct page_collect *pcol) -{ - /* length was wrong or offset was not page aligned */ - BUG_ON(pcol_src->nr_pages < ios->nr_pages); - - if (pcol_src->nr_pages > ios->nr_pages) { - struct page **src_page; - unsigned pages_less = pcol_src->nr_pages - ios->nr_pages; - unsigned long len_less = pcol_src->length - ios->length; - unsigned i; - int ret; - - /* This IO was trimmed */ - pcol_src->nr_pages = ios->nr_pages; - pcol_src->length = ios->length; - - /* Left over pages are passed to the next io */ - pcol->expected_pages += pages_less; - pcol->nr_pages = pages_less; - pcol->length = len_less; - src_page = pcol_src->pages + pcol_src->nr_pages; - pcol->pg_first = (*src_page)->index; - - ret = pcol_try_alloc(pcol); - if (unlikely(ret)) - return ret; - - for (i = 0; i < pages_less; ++i) - pcol->pages[i] = *src_page++; - - EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x " - "pages_less=0x%x expected_pages=0x%x " - "next_offset=0x%llx next_len=0x%lx\n", - pcol_src->nr_pages, pages_less, pcol->expected_pages, - pcol->pg_first * PAGE_SIZE, pcol->length); - } - return 0; -} - -static int read_exec(struct page_collect *pcol) -{ - struct exofs_i_info *oi = exofs_i(pcol->inode); - struct ore_io_state *ios; - struct page_collect *pcol_copy = NULL; - int ret; - - if (!pcol->pages) - return 0; - - if (!pcol->ios) { - int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true, - pcol->pg_first << PAGE_SHIFT, - pcol->length, &pcol->ios); - - if (ret) - return ret; - } - - ios = pcol->ios; - ios->pages = pcol->pages; - - if (pcol->read_4_write) { - ore_read(pcol->ios); - return __readpages_done(pcol); - } - - pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); - if (!pcol_copy) { - ret = -ENOMEM; - goto err; - } - - *pcol_copy = *pcol; - ios->done = readpages_done; - ios->private = pcol_copy; - - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); - - 
ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); - if (unlikely(ret)) - goto err; - - EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n", - pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); - - ret = ore_read(ios); - if (unlikely(ret)) - goto err; - - atomic_inc(&pcol->sbi->s_curr_pending); - - return 0; - -err: - if (!pcol_copy) /* Failed before ownership transfer */ - pcol_copy = pcol; - _unlock_pcol_pages(pcol_copy, ret, READ); - pcol_free(pcol_copy); - kfree(pcol_copy); - - return ret; -} - -/* readpage_strip is called either directly from readpage() or by the VFS from - * within read_cache_pages(), to add one more page to be read. It will try to - * collect as many contiguous pages as posible. If a discontinuity is - * encountered, or it runs out of resources, it will submit the previous segment - * and will start a new collection. Eventually caller must submit the last - * segment if present. - */ -static int readpage_strip(void *data, struct page *page) -{ - struct page_collect *pcol = data; - struct inode *inode = pcol->inode; - struct exofs_i_info *oi = exofs_i(inode); - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; - size_t len; - int ret; - - BUG_ON(!PageLocked(page)); - - /* FIXME: Just for debugging, will be removed */ - if (PageUptodate(page)) - EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, - page->index); - - pcol->that_locked_page = page; - - if (page->index < end_index) - len = PAGE_SIZE; - else if (page->index == end_index) - len = i_size & ~PAGE_MASK; - else - len = 0; - - if (!len || !obj_created(oi)) { - /* this will be out of bounds, or doesn't exist yet. - * Current page is cleared and the request is split - */ - clear_highpage(page); - - SetPageUptodate(page); - if (PageError(page)) - ClearPageError(page); - - if (!pcol->read_4_write) - unlock_page(page); - EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx " - "read_4_write=%d index=0x%lx end_index=0x%lx " - "splitting\n", inode->i_ino, len, - pcol->read_4_write, page->index, end_index); - - return read_exec(pcol); - } - -try_again: - - if (unlikely(pcol->pg_first == -1)) { - pcol->pg_first = page->index; - } else if (unlikely((pcol->pg_first + pcol->nr_pages) != - page->index)) { - /* Discontinuity detected, split the request */ - ret = read_exec(pcol); - if (unlikely(ret)) - goto fail; - goto try_again; - } - - if (!pcol->pages) { - ret = pcol_try_alloc(pcol); - if (unlikely(ret)) - goto fail; - } - - if (len != PAGE_SIZE) - zero_user(page, len, PAGE_SIZE - len); - - EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", - inode->i_ino, page->index, len); - - ret = pcol_add_page(pcol, page, len); - if (ret) { - EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p " - "this_len=0x%zx nr_pages=%u length=0x%lx\n", - page, len, pcol->nr_pages, pcol->length); - - /* split the request, and start again with current page */ - ret = read_exec(pcol); - if (unlikely(ret)) - goto fail; - - goto try_again; - } - - return 0; - -fail: - /* SetPageError(page); ??? 
*/ - unlock_page(page); - return ret; -} - -static int exofs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - struct page_collect pcol; - int ret; - - _pcol_init(&pcol, nr_pages, mapping->host); - - ret = read_cache_pages(mapping, pages, readpage_strip, &pcol); - if (ret) { - EXOFS_ERR("read_cache_pages => %d\n", ret); - return ret; - } - - ret = read_exec(&pcol); - if (unlikely(ret)) - return ret; - - return read_exec(&pcol); -} - -static int _readpage(struct page *page, bool read_4_write) -{ - struct page_collect pcol; - int ret; - - _pcol_init(&pcol, 1, page->mapping->host); - - pcol.read_4_write = read_4_write; - ret = readpage_strip(&pcol, page); - if (ret) { - EXOFS_ERR("_readpage => %d\n", ret); - return ret; - } - - return read_exec(&pcol); -} - -/* - * We don't need the file - */ -static int exofs_readpage(struct file *file, struct page *page) -{ - return _readpage(page, false); -} - -/* Callback for osd_write. All writes are asynchronous */ -static void writepages_done(struct ore_io_state *ios, void *p) -{ - struct page_collect *pcol = p; - int i; - u64 good_bytes; - u64 length = 0; - int ret = ore_check_io(ios, NULL); - - atomic_dec(&pcol->sbi->s_curr_pending); - - if (likely(!ret)) { - good_bytes = pcol->length; - ret = PAGE_WAS_NOT_IN_IO; - } else { - good_bytes = 0; - } - - EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" - " length=0x%lx nr_pages=%u\n", - pcol->inode->i_ino, _LLU(good_bytes), pcol->length, - pcol->nr_pages); - - for (i = 0; i < pcol->nr_pages; i++) { - struct page *page = pcol->pages[i]; - struct inode *inode = page->mapping->host; - int page_stat; - - if (inode != pcol->inode) - continue; /* osd might add more pages to a bio */ - - if (likely(length < good_bytes)) - page_stat = 0; - else - page_stat = ret; - - update_write_page(page, page_stat); - unlock_page(page); - EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", - inode->i_ino, page->index, page_stat); - - length += PAGE_SIZE; - } - - pcol_free(pcol); - kfree(pcol); - EXOFS_DBGMSG2("writepages_done END\n"); -} - -static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) -{ - struct page_collect *pcol = priv; - pgoff_t index = offset / PAGE_SIZE; - - if (!pcol->that_locked_page || - (pcol->that_locked_page->index != index)) { - struct page *page; - loff_t i_size = i_size_read(pcol->inode); - - if (offset >= i_size) { - *uptodate = true; - EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index); - return ZERO_PAGE(0); - } - - page = find_get_page(pcol->inode->i_mapping, index); - if (!page) { - page = find_or_create_page(pcol->inode->i_mapping, - index, GFP_NOFS); - if (unlikely(!page)) { - EXOFS_DBGMSG("grab_cache_page Failed " - "index=0x%llx\n", _LLU(index)); - return NULL; - } - unlock_page(page); - } - *uptodate = PageUptodate(page); - EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate); - return page; - } else { - EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n", - pcol->that_locked_page->index); - *uptodate = true; - return pcol->that_locked_page; - } -} - -static void __r4w_put_page(void *priv, struct page *page) -{ - struct page_collect *pcol = priv; - - if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { - EXOFS_DBGMSG2("index=0x%lx\n", page->index); - put_page(page); - return; - } - EXOFS_DBGMSG2("that_locked_page index=0x%lx\n", - ZERO_PAGE(0) == page ? 
-1 : page->index); -} - -static const struct _ore_r4w_op _r4w_op = { - .get_page = &__r4w_get_page, - .put_page = &__r4w_put_page, -}; - -static int write_exec(struct page_collect *pcol) -{ - struct exofs_i_info *oi = exofs_i(pcol->inode); - struct ore_io_state *ios; - struct page_collect *pcol_copy = NULL; - int ret; - - if (!pcol->pages) - return 0; - - BUG_ON(pcol->ios); - ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, - pcol->pg_first << PAGE_SHIFT, - pcol->length, &pcol->ios); - if (unlikely(ret)) - goto err; - - pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); - if (!pcol_copy) { - EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); - ret = -ENOMEM; - goto err; - } - - *pcol_copy = *pcol; - - ios = pcol->ios; - ios->pages = pcol_copy->pages; - ios->done = writepages_done; - ios->r4w = &_r4w_op; - ios->private = pcol_copy; - - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); - - ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); - if (unlikely(ret)) - goto err; - - EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n", - pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); - - ret = ore_write(ios); - if (unlikely(ret)) { - EXOFS_ERR("write_exec: ore_write() Failed\n"); - goto err; - } - - atomic_inc(&pcol->sbi->s_curr_pending); - return 0; - -err: - if (!pcol_copy) /* Failed before ownership transfer */ - pcol_copy = pcol; - _unlock_pcol_pages(pcol_copy, ret, WRITE); - pcol_free(pcol_copy); - kfree(pcol_copy); - - return ret; -} - -/* writepage_strip is called either directly from writepage() or by the VFS from - * within write_cache_pages(), to add one more page to be written to storage. - * It will try to collect as many contiguous pages as possible. If a - * discontinuity is encountered or it runs out of resources it will submit the - * previous segment and will start a new collection. - * Eventually caller must submit the last segment if present. 
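The collect/split discipline described above is shared with the read path (readpage_strip() earlier in this file): pages are appended to one page_collect while they stay contiguous, and a discontinuity, a full pages array, or ore-side trimming via _maybe_not_all_in_one_io() flushes the current segment and opens a new one. Reduced to a control-flow sketch over the helpers shown in this file (strip_one_page is my name; illustration only):

    /* exec is read_exec()/write_exec(); on success it submits the segment
     * and resets pcol (cf. _pcol_reset()), so the retry re-opens a segment. */
    static int strip_one_page(struct page_collect *pcol, struct page *page,
                              size_t len, int (*exec)(struct page_collect *))
    {
        int ret;
    again:
        if (pcol->pg_first == -1) {
            pcol->pg_first = page->index;        /* open a new segment */
        } else if (pcol->pg_first + pcol->nr_pages != page->index) {
            ret = exec(pcol);                    /* discontinuity: submit */
            if (ret)
                return ret;
            goto again;
        }
        if (!pcol->pages) {
            ret = pcol_try_alloc(pcol);
            if (ret)
                return ret;
        }
        if (pcol_add_page(pcol, page, len)) {    /* pages array full: submit */
            ret = exec(pcol);
            if (ret)
                return ret;
            goto again;
        }
        return 0;
    }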
- */ -static int writepage_strip(struct page *page, - struct writeback_control *wbc_unused, void *data) -{ - struct page_collect *pcol = data; - struct inode *inode = pcol->inode; - struct exofs_i_info *oi = exofs_i(inode); - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; - size_t len; - int ret; - - BUG_ON(!PageLocked(page)); - - ret = wait_obj_created(oi); - if (unlikely(ret)) - goto fail; - - if (page->index < end_index) - /* in this case, the page is within the limits of the file */ - len = PAGE_SIZE; - else { - len = i_size & ~PAGE_MASK; - - if (page->index > end_index || !len) { - /* in this case, the page is outside the limits - * (truncate in progress) - */ - ret = write_exec(pcol); - if (unlikely(ret)) - goto fail; - if (PageError(page)) - ClearPageError(page); - unlock_page(page); - EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) " - "outside the limits\n", - inode->i_ino, page->index); - return 0; - } - } - -try_again: - - if (unlikely(pcol->pg_first == -1)) { - pcol->pg_first = page->index; - } else if (unlikely((pcol->pg_first + pcol->nr_pages) != - page->index)) { - /* Discontinuity detected, split the request */ - ret = write_exec(pcol); - if (unlikely(ret)) - goto fail; - - EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n", - inode->i_ino, page->index); - goto try_again; - } - - if (!pcol->pages) { - ret = pcol_try_alloc(pcol); - if (unlikely(ret)) - goto fail; - } - - EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", - inode->i_ino, page->index, len); - - ret = pcol_add_page(pcol, page, len); - if (unlikely(ret)) { - EXOFS_DBGMSG2("Failed pcol_add_page " - "nr_pages=%u total_length=0x%lx\n", - pcol->nr_pages, pcol->length); - - /* split the request, next loop will start again */ - ret = write_exec(pcol); - if (unlikely(ret)) { - EXOFS_DBGMSG("write_exec failed => %d", ret); - goto fail; - } - - goto try_again; - } - - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - - return 0; - -fail: - EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n", - inode->i_ino, page->index, ret); - mapping_set_error(page->mapping, -EIO); - unlock_page(page); - return ret; -} - -static int exofs_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct page_collect pcol; - long start, end, expected_pages; - int ret; - - start = wbc->range_start >> PAGE_SHIFT; - end = (wbc->range_end == LLONG_MAX) ? 
- start + mapping->nrpages : - wbc->range_end >> PAGE_SHIFT; - - if (start || end) - expected_pages = end - start + 1; - else - expected_pages = mapping->nrpages; - - if (expected_pages < 32L) - expected_pages = 32L; - - EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " - "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n", - mapping->host->i_ino, wbc->range_start, wbc->range_end, - mapping->nrpages, start, end, expected_pages); - - _pcol_init(&pcol, expected_pages, mapping->host); - - ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); - if (unlikely(ret)) { - EXOFS_ERR("write_cache_pages => %d\n", ret); - return ret; - } - - ret = write_exec(&pcol); - if (unlikely(ret)) - return ret; - - if (wbc->sync_mode == WB_SYNC_ALL) { - return write_exec(&pcol); /* pump the last reminder */ - } else if (pcol.nr_pages) { - /* not SYNC let the reminder join the next writeout */ - unsigned i; - - for (i = 0; i < pcol.nr_pages; i++) { - struct page *page = pcol.pages[i]; - - end_page_writeback(page); - set_page_dirty(page); - unlock_page(page); - } - } - return 0; -} - -/* -static int exofs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct page_collect pcol; - int ret; - - _pcol_init(&pcol, 1, page->mapping->host); - - ret = writepage_strip(page, NULL, &pcol); - if (ret) { - EXOFS_ERR("exofs_writepage => %d\n", ret); - return ret; - } - - return write_exec(&pcol); -} -*/ -/* i_mutex held using inode->i_size directly */ -static void _write_failed(struct inode *inode, loff_t to) -{ - if (to > inode->i_size) - truncate_pagecache(inode, inode->i_size); -} - -int exofs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - int ret = 0; - struct page *page; - - page = *pagep; - if (page == NULL) { - page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT, - flags); - if (!page) { - EXOFS_DBGMSG("grab_cache_page_write_begin failed\n"); - return -ENOMEM; - } - *pagep = page; - } - - /* read modify write */ - if (!PageUptodate(page) && (len != PAGE_SIZE)) { - loff_t i_size = i_size_read(mapping->host); - pgoff_t end_index = i_size >> PAGE_SHIFT; - - if (page->index > end_index) { - clear_highpage(page); - SetPageUptodate(page); - } else { - ret = _readpage(page, true); - if (ret) { - unlock_page(page); - EXOFS_DBGMSG("__readpage failed\n"); - } - } - } - return ret; -} - -static int exofs_write_begin_export(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - *pagep = NULL; - - return exofs_write_begin(file, mapping, pos, len, flags, pagep, - fsdata); -} - -static int exofs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = mapping->host; - loff_t last_pos = pos + copied; - - if (!PageUptodate(page)) { - if (copied < len) { - _write_failed(inode, pos + len); - copied = 0; - goto out; - } - SetPageUptodate(page); - } - if (last_pos > inode->i_size) { - i_size_write(inode, last_pos); - mark_inode_dirty(inode); - } - set_page_dirty(page); -out: - unlock_page(page); - put_page(page); - return copied; -} - -static int exofs_releasepage(struct page *page, gfp_t gfp) -{ - EXOFS_DBGMSG("page 0x%lx\n", page->index); - WARN_ON(1); - return 0; -} - -static void exofs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - EXOFS_DBGMSG("page 0x%lx 
offset 0x%x length 0x%x\n", - page->index, offset, length); - WARN_ON(1); -} - - - /* TODO: Should be easy enough to do proprly */ -static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - return 0; -} - -const struct address_space_operations exofs_aops = { - .readpage = exofs_readpage, - .readpages = exofs_readpages, - .writepage = NULL, - .writepages = exofs_writepages, - .write_begin = exofs_write_begin_export, - .write_end = exofs_write_end, - .releasepage = exofs_releasepage, - .set_page_dirty = __set_page_dirty_nobuffers, - .invalidatepage = exofs_invalidatepage, - - /* Not implemented Yet */ - .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ - .direct_IO = exofs_direct_IO, - - /* With these NULL has special meaning or default is not exported */ - .migratepage = NULL, - .launder_page = NULL, - .is_partially_uptodate = NULL, - .error_remove_page = NULL, -}; - -/****************************************************************************** - * INODE OPERATIONS - *****************************************************************************/ - -/* - * Test whether an inode is a fast symlink. - */ -static inline int exofs_inode_is_fast_symlink(struct inode *inode) -{ - struct exofs_i_info *oi = exofs_i(inode); - - return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); -} - -static int _do_truncate(struct inode *inode, loff_t newsize) -{ - struct exofs_i_info *oi = exofs_i(inode); - struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; - int ret; - - inode->i_mtime = inode->i_ctime = current_time(inode); - - ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize); - if (likely(!ret)) - truncate_setsize(inode, newsize); - - EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n", - inode->i_ino, newsize, ret); - return ret; -} - -/* - * Set inode attributes - update size attribute on OSD if needed, - * otherwise just call generic functions. - */ -int exofs_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = d_inode(dentry); - int error; - - /* if we are about to modify an object, and it hasn't been - * created yet, wait - */ - error = wait_obj_created(exofs_i(inode)); - if (unlikely(error)) - return error; - - error = setattr_prepare(dentry, iattr); - if (unlikely(error)) - return error; - - if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(inode)) { - error = _do_truncate(inode, iattr->ia_size); - if (unlikely(error)) - return error; - } - - setattr_copy(inode, iattr); - mark_inode_dirty(inode); - return 0; -} - -static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF( - EXOFS_APAGE_FS_DATA, - EXOFS_ATTR_INODE_FILE_LAYOUT, - 0); -static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( - EXOFS_APAGE_FS_DATA, - EXOFS_ATTR_INODE_DIR_LAYOUT, - 0); - -/* - * Read the Linux inode info from the OSD, and return it as is. In exofs the - * inode info is in an application specific page/attribute of the osd-object. 
- */ -static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, - struct exofs_fcb *inode) -{ - struct exofs_sb_info *sbi = sb->s_fs_info; - struct osd_attr attrs[] = { - [0] = g_attr_inode_data, - [1] = g_attr_inode_file_layout, - [2] = g_attr_inode_dir_layout, - }; - struct ore_io_state *ios; - struct exofs_on_disk_inode_layout *layout; - int ret; - - ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); - if (unlikely(ret)) { - EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); - return ret; - } - - attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); - attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); - - ios->in_attr = attrs; - ios->in_attr_len = ARRAY_SIZE(attrs); - - ret = ore_read(ios); - if (unlikely(ret)) { - EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", - _LLU(oi->one_comp.obj.id), ret); - memset(inode, 0, sizeof(*inode)); - inode->i_mode = 0040000 | (0777 & ~022); - /* If object is lost on target we might as well enable it's - * delete. - */ - ret = 0; - goto out; - } - - ret = extract_attr_from_ios(ios, &attrs[0]); - if (ret) { - EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__); - goto out; - } - WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); - memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE); - - ret = extract_attr_from_ios(ios, &attrs[1]); - if (ret) { - EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__); - goto out; - } - if (attrs[1].len) { - layout = attrs[1].val_ptr; - if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { - EXOFS_ERR("%s: unsupported files layout %d\n", - __func__, layout->gen_func); - ret = -ENOTSUPP; - goto out; - } - } - - ret = extract_attr_from_ios(ios, &attrs[2]); - if (ret) { - EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__); - goto out; - } - if (attrs[2].len) { - layout = attrs[2].val_ptr; - if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { - EXOFS_ERR("%s: unsupported meta-data layout %d\n", - __func__, layout->gen_func); - ret = -ENOTSUPP; - goto out; - } - } - -out: - ore_put_io_state(ios); - return ret; -} - -static void __oi_init(struct exofs_i_info *oi) -{ - init_waitqueue_head(&oi->i_wq); - oi->i_flags = 0; -} -/* - * Fill in an inode read from the OSD and set it up for use - */ -struct inode *exofs_iget(struct super_block *sb, unsigned long ino) -{ - struct exofs_i_info *oi; - struct exofs_fcb fcb; - struct inode *inode; - int ret; - - inode = iget_locked(sb, ino); - if (!inode) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return inode; - oi = exofs_i(inode); - __oi_init(oi); - exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, - exofs_oi_objno(oi)); - - /* read the inode from the osd */ - ret = exofs_get_inode(sb, oi, &fcb); - if (ret) - goto bad_inode; - - set_obj_created(oi); - - /* copy stuff from on-disk struct to in-memory struct */ - inode->i_mode = le16_to_cpu(fcb.i_mode); - i_uid_write(inode, le32_to_cpu(fcb.i_uid)); - i_gid_write(inode, le32_to_cpu(fcb.i_gid)); - set_nlink(inode, le16_to_cpu(fcb.i_links_count)); - inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); - inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); - inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); - inode->i_ctime.tv_nsec = - inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; - oi->i_commit_size = le64_to_cpu(fcb.i_size); - i_size_write(inode, oi->i_commit_size); - inode->i_blkbits = EXOFS_BLKSHIFT; - inode->i_generation = le32_to_cpu(fcb.i_generation); - - oi->i_dir_start_lookup = 0; - - if 
((inode->i_nlink == 0) && (inode->i_mode == 0)) { - ret = -ESTALE; - goto bad_inode; - } - - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - if (fcb.i_data[0]) - inode->i_rdev = - old_decode_dev(le32_to_cpu(fcb.i_data[0])); - else - inode->i_rdev = - new_decode_dev(le32_to_cpu(fcb.i_data[1])); - } else { - memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); - } - - if (S_ISREG(inode->i_mode)) { - inode->i_op = &exofs_file_inode_operations; - inode->i_fop = &exofs_file_operations; - inode->i_mapping->a_ops = &exofs_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &exofs_dir_inode_operations; - inode->i_fop = &exofs_dir_operations; - inode->i_mapping->a_ops = &exofs_aops; - } else if (S_ISLNK(inode->i_mode)) { - if (exofs_inode_is_fast_symlink(inode)) { - inode->i_op = &simple_symlink_inode_operations; - inode->i_link = (char *)oi->i_data; - } else { - inode->i_op = &page_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &exofs_aops; - } - } else { - inode->i_op = &exofs_special_inode_operations; - if (fcb.i_data[0]) - init_special_inode(inode, inode->i_mode, - old_decode_dev(le32_to_cpu(fcb.i_data[0]))); - else - init_special_inode(inode, inode->i_mode, - new_decode_dev(le32_to_cpu(fcb.i_data[1]))); - } - - unlock_new_inode(inode); - return inode; - -bad_inode: - iget_failed(inode); - return ERR_PTR(ret); -} - -int __exofs_wait_obj_created(struct exofs_i_info *oi) -{ - if (!obj_created(oi)) { - EXOFS_DBGMSG("!obj_created\n"); - BUG_ON(!obj_2bcreated(oi)); - wait_event(oi->i_wq, obj_created(oi)); - EXOFS_DBGMSG("wait_event done\n"); - } - return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; -} - -/* - * Callback function from exofs_new_inode(). The important thing is that we - * set the obj_created flag so that other methods know that the object exists on - * the OSD. - */ -static void create_done(struct ore_io_state *ios, void *p) -{ - struct inode *inode = p; - struct exofs_i_info *oi = exofs_i(inode); - struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; - int ret; - - ret = ore_check_io(ios, NULL); - ore_put_io_state(ios); - - atomic_dec(&sbi->s_curr_pending); - - if (unlikely(ret)) { - EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx", - _LLU(exofs_oi_objno(oi)), - _LLU(oi->one_comp.obj.partition)); - /*TODO: When FS is corrupted creation can fail, object already - * exist. Get rid of this asynchronous creation, if exist - * increment the obj counter and try the next object. Until we - * succeed. 
All these dangling objects will be made into lost - * files by chkfs.exofs - */ - } - - set_obj_created(oi); - - wake_up(&oi->i_wq); -} - -/* - * Set up a new inode and create an object for it on the OSD - */ -struct inode *exofs_new_inode(struct inode *dir, umode_t mode) -{ - struct super_block *sb = dir->i_sb; - struct exofs_sb_info *sbi = sb->s_fs_info; - struct inode *inode; - struct exofs_i_info *oi; - struct ore_io_state *ios; - int ret; - - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - oi = exofs_i(inode); - __oi_init(oi); - - set_obj_2bcreated(oi); - - inode_init_owner(inode, dir, mode); - inode->i_ino = sbi->s_nextid++; - inode->i_blkbits = EXOFS_BLKSHIFT; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - oi->i_commit_size = inode->i_size = 0; - spin_lock(&sbi->s_next_gen_lock); - inode->i_generation = sbi->s_next_generation++; - spin_unlock(&sbi->s_next_gen_lock); - insert_inode_hash(inode); - - exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, - exofs_oi_objno(oi)); - exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ - - mark_inode_dirty(inode); - - ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); - if (unlikely(ret)) { - EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); - return ERR_PTR(ret); - } - - ios->done = create_done; - ios->private = inode; - - ret = ore_create(ios); - if (ret) { - ore_put_io_state(ios); - return ERR_PTR(ret); - } - atomic_inc(&sbi->s_curr_pending); - - return inode; -} - -/* - * struct to pass two arguments to update_inode's callback - */ -struct updatei_args { - struct exofs_sb_info *sbi; - struct exofs_fcb fcb; -}; - -/* - * Callback function from exofs_update_inode(). - */ -static void updatei_done(struct ore_io_state *ios, void *p) -{ - struct updatei_args *args = p; - - ore_put_io_state(ios); - - atomic_dec(&args->sbi->s_curr_pending); - - kfree(args); -} - -/* - * Write the inode to the OSD. Just fill up the struct, and set the attribute - * synchronously or asynchronously depending on the do_sync flag. 
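One pattern worth naming before the function below: every asynchronous submission in this file uses the same accounting shape, where the completion callback owns both ore_put_io_state() and the atomic_dec() of sbi->s_curr_pending, and the counter is only raised once the submit has succeeded. As a reduced sketch (submit_async is my name; the real call sites are exofs_update_inode() below, exofs_new_inode(), read_exec() and write_exec() above):

    static int submit_async(struct exofs_sb_info *sbi, struct ore_io_state *ios,
                            void (*done)(struct ore_io_state *, void *), void *p)
    {
        int ret;

        ios->done = done;        /* callback: ore_put_io_state() +       */
        ios->private = p;        /* atomic_dec(&sbi->s_curr_pending)     */
        ret = ore_write(ios);
        if (ret)
            return ret;          /* submit failed: nothing is in flight  */
        atomic_inc(&sbi->s_curr_pending);
        return 0;
    }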
- */ -static int exofs_update_inode(struct inode *inode, int do_sync) -{ - struct exofs_i_info *oi = exofs_i(inode); - struct super_block *sb = inode->i_sb; - struct exofs_sb_info *sbi = sb->s_fs_info; - struct ore_io_state *ios; - struct osd_attr attr; - struct exofs_fcb *fcb; - struct updatei_args *args; - int ret; - - args = kzalloc(sizeof(*args), GFP_KERNEL); - if (!args) { - EXOFS_DBGMSG("Failed kzalloc of args\n"); - return -ENOMEM; - } - - fcb = &args->fcb; - - fcb->i_mode = cpu_to_le16(inode->i_mode); - fcb->i_uid = cpu_to_le32(i_uid_read(inode)); - fcb->i_gid = cpu_to_le32(i_gid_read(inode)); - fcb->i_links_count = cpu_to_le16(inode->i_nlink); - fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); - fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec); - fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); - oi->i_commit_size = i_size_read(inode); - fcb->i_size = cpu_to_le64(oi->i_commit_size); - fcb->i_generation = cpu_to_le32(inode->i_generation); - - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - if (old_valid_dev(inode->i_rdev)) { - fcb->i_data[0] = - cpu_to_le32(old_encode_dev(inode->i_rdev)); - fcb->i_data[1] = 0; - } else { - fcb->i_data[0] = 0; - fcb->i_data[1] = - cpu_to_le32(new_encode_dev(inode->i_rdev)); - fcb->i_data[2] = 0; - } - } else - memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - - ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); - if (unlikely(ret)) { - EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); - goto free_args; - } - - attr = g_attr_inode_data; - attr.val_ptr = fcb; - ios->out_attr_len = 1; - ios->out_attr = &attr; - - wait_obj_created(oi); - - if (!do_sync) { - args->sbi = sbi; - ios->done = updatei_done; - ios->private = args; - } - - ret = ore_write(ios); - if (!do_sync && !ret) { - atomic_inc(&sbi->s_curr_pending); - goto out; /* deallocation in updatei_done */ - } - - ore_put_io_state(ios); -free_args: - kfree(args); -out: - EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n", - inode->i_ino, do_sync, ret); - return ret; -} - -int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */ - return exofs_update_inode(inode, 1); -} - -/* - * Callback function from exofs_delete_inode() - don't have much cleaning up to - * do. - */ -static void delete_done(struct ore_io_state *ios, void *p) -{ - struct exofs_sb_info *sbi = p; - - ore_put_io_state(ios); - - atomic_dec(&sbi->s_curr_pending); -} - -/* - * Called when the refcount of an inode reaches zero. We remove the object - * from the OSD here. We make sure the object was created before we try and - * delete it. - */ -void exofs_evict_inode(struct inode *inode) -{ - struct exofs_i_info *oi = exofs_i(inode); - struct super_block *sb = inode->i_sb; - struct exofs_sb_info *sbi = sb->s_fs_info; - struct ore_io_state *ios; - int ret; - - truncate_inode_pages_final(&inode->i_data); - - /* TODO: should do better here */ - if (inode->i_nlink || is_bad_inode(inode)) - goto no_delete; - - inode->i_size = 0; - clear_inode(inode); - - /* if we are deleting an obj that hasn't been created yet, wait. - * This also makes sure that create_done cannot be called with an - * already evicted inode. 
- */ - wait_obj_created(oi); - /* ignore the error, attempt a remove anyway */ - - /* Now Remove the OSD objects */ - ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); - if (unlikely(ret)) { - EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); - return; - } - - ios->done = delete_done; - ios->private = sbi; - - ret = ore_remove(ios); - if (ret) { - EXOFS_ERR("%s: ore_remove failed\n", __func__); - ore_put_io_state(ios); - return; - } - atomic_inc(&sbi->s_curr_pending); - - return; - -no_delete: - clear_inode(inode); -} diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c deleted file mode 100644 index 7295cd722770..000000000000 --- a/fs/exofs/namei.c +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "exofs.h" - -static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode) -{ - int err = exofs_add_link(dentry, inode); - if (!err) { - d_instantiate(dentry, inode); - return 0; - } - inode_dec_link_count(inode); - iput(inode); - return err; -} - -static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct inode *inode; - ino_t ino; - - if (dentry->d_name.len > EXOFS_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - - ino = exofs_inode_by_name(dir, dentry); - inode = ino ? 
exofs_iget(dir->i_sb, ino) : NULL; - return d_splice_alias(inode, dentry); -} - -static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) -{ - struct inode *inode = exofs_new_inode(dir, mode); - int err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &exofs_file_inode_operations; - inode->i_fop = &exofs_file_operations; - inode->i_mapping->a_ops = &exofs_aops; - mark_inode_dirty(inode); - err = exofs_add_nondir(dentry, inode); - } - return err; -} - -static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) -{ - struct inode *inode; - int err; - - inode = exofs_new_inode(dir, mode); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - init_special_inode(inode, inode->i_mode, rdev); - mark_inode_dirty(inode); - err = exofs_add_nondir(dentry, inode); - } - return err; -} - -static int exofs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) -{ - struct super_block *sb = dir->i_sb; - int err = -ENAMETOOLONG; - unsigned l = strlen(symname)+1; - struct inode *inode; - struct exofs_i_info *oi; - - if (l > sb->s_blocksize) - goto out; - - inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out; - - oi = exofs_i(inode); - if (l > sizeof(oi->i_data)) { - /* slow symlink */ - inode->i_op = &page_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &exofs_aops; - memset(oi->i_data, 0, sizeof(oi->i_data)); - - err = page_symlink(inode, symname, l); - if (err) - goto out_fail; - } else { - /* fast symlink */ - inode->i_op = &simple_symlink_inode_operations; - inode->i_link = (char *)oi->i_data; - memcpy(oi->i_data, symname, l); - inode->i_size = l-1; - } - mark_inode_dirty(inode); - - err = exofs_add_nondir(dentry, inode); -out: - return err; - -out_fail: - inode_dec_link_count(inode); - iput(inode); - goto out; -} - -static int exofs_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *dentry) -{ - struct inode *inode = d_inode(old_dentry); - - inode->i_ctime = current_time(inode); - inode_inc_link_count(inode); - ihold(inode); - - return exofs_add_nondir(dentry, inode); -} - -static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - struct inode *inode; - int err; - - inode_inc_link_count(dir); - - inode = exofs_new_inode(dir, S_IFDIR | mode); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_dir; - - inode->i_op = &exofs_dir_inode_operations; - inode->i_fop = &exofs_dir_operations; - inode->i_mapping->a_ops = &exofs_aops; - - inode_inc_link_count(inode); - - err = exofs_make_empty(inode, dir); - if (err) - goto out_fail; - - err = exofs_add_link(dentry, inode); - if (err) - goto out_fail; - - d_instantiate(dentry, inode); -out: - return err; - -out_fail: - inode_dec_link_count(inode); - inode_dec_link_count(inode); - iput(inode); -out_dir: - inode_dec_link_count(dir); - goto out; -} - -static int exofs_unlink(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - struct exofs_dir_entry *de; - struct page *page; - int err = -ENOENT; - - de = exofs_find_entry(dir, dentry, &page); - if (!de) - goto out; - - err = exofs_delete_entry(de, page); - if (err) - goto out; - - inode->i_ctime = dir->i_ctime; - inode_dec_link_count(inode); - err = 0; -out: - return err; -} - -static int exofs_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - int err = -ENOTEMPTY; - - if (exofs_empty_dir(inode)) { - err = 
exofs_unlink(dir, dentry); - if (!err) { - inode->i_size = 0; - inode_dec_link_count(inode); - inode_dec_link_count(dir); - } - } - return err; -} - -static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) -{ - struct inode *old_inode = d_inode(old_dentry); - struct inode *new_inode = d_inode(new_dentry); - struct page *dir_page = NULL; - struct exofs_dir_entry *dir_de = NULL; - struct page *old_page; - struct exofs_dir_entry *old_de; - int err = -ENOENT; - - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - - old_de = exofs_find_entry(old_dir, old_dentry, &old_page); - if (!old_de) - goto out; - - if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; - dir_de = exofs_dotdot(old_inode, &dir_page); - if (!dir_de) - goto out_old; - } - - if (new_inode) { - struct page *new_page; - struct exofs_dir_entry *new_de; - - err = -ENOTEMPTY; - if (dir_de && !exofs_empty_dir(new_inode)) - goto out_dir; - - err = -ENOENT; - new_de = exofs_find_entry(new_dir, new_dentry, &new_page); - if (!new_de) - goto out_dir; - err = exofs_set_link(new_dir, new_de, new_page, old_inode); - new_inode->i_ctime = current_time(new_inode); - if (dir_de) - drop_nlink(new_inode); - inode_dec_link_count(new_inode); - if (err) - goto out_dir; - } else { - err = exofs_add_link(new_dentry, old_inode); - if (err) - goto out_dir; - if (dir_de) - inode_inc_link_count(new_dir); - } - - old_inode->i_ctime = current_time(old_inode); - - exofs_delete_entry(old_de, old_page); - mark_inode_dirty(old_inode); - - if (dir_de) { - err = exofs_set_link(old_inode, dir_de, dir_page, new_dir); - inode_dec_link_count(old_dir); - if (err) - goto out_dir; - } - return 0; - - -out_dir: - if (dir_de) { - kunmap(dir_page); - put_page(dir_page); - } -out_old: - kunmap(old_page); - put_page(old_page); -out: - return err; -} - -const struct inode_operations exofs_dir_inode_operations = { - .create = exofs_create, - .lookup = exofs_lookup, - .link = exofs_link, - .unlink = exofs_unlink, - .symlink = exofs_symlink, - .mkdir = exofs_mkdir, - .rmdir = exofs_rmdir, - .mknod = exofs_mknod, - .rename = exofs_rename, - .setattr = exofs_setattr, -}; - -const struct inode_operations exofs_special_inode_operations = { - .setattr = exofs_setattr, -}; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c deleted file mode 100644 index 24a8e34882e9..000000000000 --- a/fs/exofs/ore.c +++ /dev/null @@ -1,1179 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <linux/slab.h> -#include <linux/module.h> -#include <asm/div64.h> -#include <linux/lcm.h> - -#include "ore_raid.h" - -MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>"); -MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); -MODULE_LICENSE("GPL"); - -/* ore_verify_layout does a couple of things: - * 1. Given a minimum number of needed parameters fixes up the rest of the - * members to be operatonals for the ore. The needed parameters are those - * that are defined by the pnfs-objects layout STD. - * 2. Check to see if the current ore code actually supports these parameters - * for example stripe_unit must be a multple of the system PAGE_SIZE, - * and etc... - * 3. Cache some havily used calculations that will be needed by users. - */ - -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; - -int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) -{ - u64 stripe_length; - - switch (layout->raid_algorithm) { - case PNFS_OSD_RAID_0: - layout->parity = 0; - break; - case PNFS_OSD_RAID_5: - layout->parity = 1; - break; - case PNFS_OSD_RAID_PQ: - layout->parity = 2; - break; - case PNFS_OSD_RAID_4: - default: - ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n", - layout->raid_algorithm); - return -EINVAL; - } - if (0 != (layout->stripe_unit & ~PAGE_MASK)) { - ORE_ERR("Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(layout->stripe_unit), PAGE_SIZE); - return -EINVAL; - } - if (layout->group_width) { - if (!layout->group_depth) { - ORE_ERR("group_depth == 0 && group_width != 0\n"); - return -EINVAL; - } - if (total_comps < (layout->group_width * layout->mirrors_p1)) { - ORE_ERR("Data Map wrong, " - "numdevs=%d < group_width=%d * mirrors=%d\n", - total_comps, layout->group_width, - layout->mirrors_p1); - return -EINVAL; - } - layout->group_count = total_comps / layout->mirrors_p1 / - layout->group_width; - } else { - if (layout->group_depth) { - printk(KERN_NOTICE "Warning: group_depth ignored " - "group_width == 0 && group_depth == %lld\n", - _LLU(layout->group_depth)); - } - layout->group_width = total_comps / layout->mirrors_p1; - layout->group_depth = -1; - layout->group_count = 1; - } - - stripe_length = (u64)layout->group_width * layout->stripe_unit; - if (stripe_length >= (1ULL << 32)) { - ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n", - _LLU(stripe_length)); - return -EINVAL; - } - - layout->max_io_length = - (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * - (layout->group_width - layout->parity); - if (layout->parity) { - unsigned stripe_length = - (layout->group_width - layout->parity) * - layout->stripe_unit; - - layout->max_io_length /= stripe_length; - layout->max_io_length *= stripe_length; - } - ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length); - - return 0; -} -EXPORT_SYMBOL(ore_verify_layout); - -static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) -{ - return ios->oc->comps[index & ios->oc->single_comp].cred; -} - -static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) -{ - return &ios->oc->comps[index & ios->oc->single_comp].obj; -} - -static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) -{ - ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n", - ios->oc->first_dev, 
ios->oc->numdevs, index, - ios->oc->ods); - - return ore_comp_dev(ios->oc, index); -} - -int _ore_get_io_state(struct ore_layout *layout, - struct ore_components *oc, unsigned numdevs, - unsigned sgs_per_dev, unsigned num_par_pages, - struct ore_io_state **pios) -{ - struct ore_io_state *ios; - size_t size_ios, size_extra, size_total; - void *ios_extra; - - /* - * The desired layout looks like this, with the extra_allocation - * items pointed at from fields within ios or per_dev: - - struct __alloc_all_io_state { - struct ore_io_state ios; - struct ore_per_dev_state per_dev[numdevs]; - union { - struct osd_sg_entry sglist[sgs_per_dev * numdevs]; - struct page *pages[num_par_pages]; - } extra_allocation; - } whole_allocation; - - */ - - /* This should never happen, so abort early if it ever does. */ - if (sgs_per_dev && num_par_pages) { - ORE_DBGMSG("Tried to use both pages and sglist\n"); - *pios = NULL; - return -EINVAL; - } - - if (numdevs > (INT_MAX - sizeof(*ios)) / - sizeof(struct ore_per_dev_state)) - return -ENOMEM; - size_ios = sizeof(*ios) + sizeof(struct ore_per_dev_state) * numdevs; - - if (sgs_per_dev * numdevs > INT_MAX / sizeof(struct osd_sg_entry)) - return -ENOMEM; - if (num_par_pages > INT_MAX / sizeof(struct page *)) - return -ENOMEM; - size_extra = max(sizeof(struct osd_sg_entry) * (sgs_per_dev * numdevs), - sizeof(struct page *) * num_par_pages); - - size_total = size_ios + size_extra; - - if (likely(size_total <= PAGE_SIZE)) { - ios = kzalloc(size_total, GFP_KERNEL); - if (unlikely(!ios)) { - ORE_DBGMSG("Failed kzalloc bytes=%zd\n", size_total); - *pios = NULL; - return -ENOMEM; - } - ios_extra = (char *)ios + size_ios; - } else { - ios = kzalloc(size_ios, GFP_KERNEL); - if (unlikely(!ios)) { - ORE_DBGMSG("Failed alloc first part bytes=%zd\n", - size_ios); - *pios = NULL; - return -ENOMEM; - } - ios_extra = kzalloc(size_extra, GFP_KERNEL); - if (unlikely(!ios_extra)) { - ORE_DBGMSG("Failed alloc second part bytes=%zd\n", - size_extra); - kfree(ios); - *pios = NULL; - return -ENOMEM; - } - - /* In this case the per_dev[0].sgilist holds the pointer to - * be freed - */ - ios->extra_part_alloc = true; - } - - if (num_par_pages) { - ios->parity_pages = ios_extra; - ios->max_par_pages = num_par_pages; - } - if (sgs_per_dev) { - struct osd_sg_entry *sgilist = ios_extra; - unsigned d; - - for (d = 0; d < numdevs; ++d) { - ios->per_dev[d].sglist = sgilist; - sgilist += sgs_per_dev; - } - ios->sgs_per_dev = sgs_per_dev; - } - - ios->layout = layout; - ios->oc = oc; - *pios = ios; - return 0; -} - -/* Allocate an io_state for only a single group of devices - * - * If a user needs to call ore_read/write() this version must be used becase it - * allocates extra stuff for striping and raid. - * The ore might decide to only IO less then @length bytes do to alignmets - * and constrains as follows: - * - The IO cannot cross group boundary. - * - In raid5/6 The end of the IO must align at end of a stripe eg. - * (@offset + @length) % strip_size == 0. Or the complete range is within a - * single stripe. - * - Memory condition only permitted a shorter IO. (A user can use @length=~0 - * And check the returned ios->length for max_io_size.) 
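 *
 * (Aside, not part of the original file: the "one allocation, trailing
 * arrays" layout that _ore_get_io_state() builds above, reduced to a
 * userspace sketch — names are hypothetical and <stdlib.h> is assumed:
 *
 *	struct per_dev_state { long offset, length; };
 *
 *	struct io_state {
 *		unsigned numdevs;
 *		void *extra;			// sglist or parity pages
 *		struct per_dev_state per_dev[];	// trailing flexible array
 *	};
 *
 *	static struct io_state *io_state_alloc(unsigned numdevs,
 *					       size_t extra_bytes)
 *	{
 *		size_t size_ios = sizeof(struct io_state)
 *				+ numdevs * sizeof(struct per_dev_state);
 *		struct io_state *ios = calloc(1, size_ios + extra_bytes);
 *
 *		if (!ios)
 *			return NULL;
 *		ios->numdevs = numdevs;
 *		ios->extra = (char *)ios + size_ios;
 *		return ios;	// one free(ios) releases everything
 *	}
 *
 * As in the kernel code, the fast path costs one allocation and one
 * free; only when the total would exceed PAGE_SIZE does
 * _ore_get_io_state() fall back to a second allocation, flagged by
 * extra_part_alloc.)
 *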
- * - * The caller must check returned ios->length (and/or ios->nr_pages) and - * re-issue these pages that fall outside of ios->length - */ -int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, - bool is_reading, u64 offset, u64 length, - struct ore_io_state **pios) -{ - struct ore_io_state *ios; - unsigned numdevs = layout->group_width * layout->mirrors_p1; - unsigned sgs_per_dev = 0, max_par_pages = 0; - int ret; - - if (layout->parity && length) { - unsigned data_devs = layout->group_width - layout->parity; - unsigned stripe_size = layout->stripe_unit * data_devs; - unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; - u32 remainder; - u64 num_stripes; - u64 num_raid_units; - - num_stripes = div_u64_rem(length, stripe_size, &remainder); - if (remainder) - ++num_stripes; - - num_raid_units = num_stripes * layout->parity; - - if (is_reading) { - /* For reads add per_dev sglist array */ - /* TODO: Raid 6 we need twice more. Actually: - * num_stripes / LCMdP(W,P); - * if (W%P != 0) num_stripes *= parity; - */ - - /* first/last seg is split */ - num_raid_units += layout->group_width; - sgs_per_dev = div_u64(num_raid_units, data_devs) + 2; - } else { - /* For Writes add parity pages array. */ - max_par_pages = num_raid_units * pages_in_unit * - sizeof(struct page *); - } - } - - ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, - pios); - if (unlikely(ret)) - return ret; - - ios = *pios; - ios->reading = is_reading; - ios->offset = offset; - - if (length) { - ore_calc_stripe_info(layout, offset, length, &ios->si); - ios->length = ios->si.length; - ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) + - ios->length + PAGE_SIZE - 1) / PAGE_SIZE; - if (layout->parity) - _ore_post_alloc_raid_stuff(ios); - } - - return 0; -} -EXPORT_SYMBOL(ore_get_rw_state); - -/* Allocate an io_state for all the devices in the comps array - * - * This version of io_state allocation is used mostly by create/remove - * and trunc where we currently need all the devices. The only wastful - * bit is the read/write_attributes with no IO. 
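 *
 * (Worked example of the read-path sizing in ore_get_rw_state() above,
 * with illustrative numbers not taken from the source: length = 1M,
 * group_width = 4, parity = 1, stripe_unit = 64K give data_devs = 3
 * and stripe_size = 192K, so num_stripes = ceil(1M / 192K) = 6 and
 * num_raid_units = 6 * parity = 6; the read path then adds group_width
 * for the split first/last segments, giving 10, and ends up with
 * sgs_per_dev = 10 / 3 + 2 = 5 by integer division.)
 *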
Those sites should - * be converted to use ore_get_rw_state() with length=0 - */ -int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, - struct ore_io_state **pios) -{ - return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); -} -EXPORT_SYMBOL(ore_get_io_state); - -void ore_put_io_state(struct ore_io_state *ios) -{ - if (ios) { - unsigned i; - - for (i = 0; i < ios->numdevs; i++) { - struct ore_per_dev_state *per_dev = &ios->per_dev[i]; - - if (per_dev->or) - osd_end_request(per_dev->or); - if (per_dev->bio) - bio_put(per_dev->bio); - } - - _ore_free_raid_stuff(ios); - kfree(ios); - } -} -EXPORT_SYMBOL(ore_put_io_state); - -static void _sync_done(struct ore_io_state *ios, void *p) -{ - struct completion *waiting = p; - - complete(waiting); -} - -static void _last_io(struct kref *kref) -{ - struct ore_io_state *ios = container_of( - kref, struct ore_io_state, kref); - - ios->done(ios, ios->private); -} - -static void _done_io(struct osd_request *or, void *p) -{ - struct ore_io_state *ios = p; - - kref_put(&ios->kref, _last_io); -} - -int ore_io_execute(struct ore_io_state *ios) -{ - DECLARE_COMPLETION_ONSTACK(wait); - bool sync = (ios->done == NULL); - int i, ret; - - if (sync) { - ios->done = _sync_done; - ios->private = &wait; - } - - for (i = 0; i < ios->numdevs; i++) { - struct osd_request *or = ios->per_dev[i].or; - if (unlikely(!or)) - continue; - - ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL); - if (unlikely(ret)) { - ORE_DBGMSG("Failed to osd_finalize_request() => %d\n", - ret); - return ret; - } - } - - kref_init(&ios->kref); - - for (i = 0; i < ios->numdevs; i++) { - struct osd_request *or = ios->per_dev[i].or; - if (unlikely(!or)) - continue; - - kref_get(&ios->kref); - osd_execute_request_async(or, _done_io, ios); - } - - kref_put(&ios->kref, _last_io); - ret = 0; - - if (sync) { - wait_for_completion(&wait); - ret = ore_check_io(ios, NULL); - } - return ret; -} - -static void _clear_bio(struct bio *bio) -{ - struct bio_vec *bv; - unsigned i; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bv, bio, i, iter_all) { - unsigned this_count = bv->bv_len; - - if (likely(PAGE_SIZE == this_count)) - clear_highpage(bv->bv_page); - else - zero_user(bv->bv_page, bv->bv_offset, this_count); - } -} - -int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) -{ - enum osd_err_priority acumulated_osd_err = 0; - int acumulated_lin_err = 0; - int i; - - for (i = 0; i < ios->numdevs; i++) { - struct osd_sense_info osi; - struct ore_per_dev_state *per_dev = &ios->per_dev[i]; - struct osd_request *or = per_dev->or; - int ret; - - if (unlikely(!or)) - continue; - - ret = osd_req_decode_sense(or, &osi); - if (likely(!ret)) - continue; - - if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) && - per_dev->bio) { - /* start read offset passed endof file. - * Note: if we do not have bio it means read-attributes - * In this case we should return error to caller. - */ - _clear_bio(per_dev->bio); - ORE_DBGMSG("start read offset passed end of file " - "offset=0x%llx, length=0x%llx\n", - _LLU(per_dev->offset), - _LLU(per_dev->length)); - - continue; /* we recovered */ - } - - if (on_dev_error) { - u64 residual = ios->reading ? 
- or->in.residual : or->out.residual; - u64 offset = (ios->offset + ios->length) - residual; - unsigned dev = per_dev->dev - ios->oc->first_dev; - struct ore_dev *od = ios->oc->ods[dev]; - - on_dev_error(ios, od, dev, osi.osd_err_pri, - offset, residual); - } - if (osi.osd_err_pri >= acumulated_osd_err) { - acumulated_osd_err = osi.osd_err_pri; - acumulated_lin_err = ret; - } - } - - return acumulated_lin_err; -} -EXPORT_SYMBOL(ore_check_io); - -/* - * L - logical offset into the file - * - * D - number of Data devices - * D = group_width - parity - * - * U - The number of bytes in a stripe within a group - * U = stripe_unit * D - * - * T - The number of bytes striped within a group of component objects - * (before advancing to the next group) - * T = U * group_depth - * - * S - The number of bytes striped across all component objects - * before the pattern repeats - * S = T * group_count - * - * M - The "major" (i.e., across all components) cycle number - * M = L / S - * - * G - Counts the groups from the beginning of the major cycle - * G = (L - (M * S)) / T [or (L % S) / T] - * - * H - The byte offset within the group - * H = (L - (M * S)) % T [or (L % S) % T] - * - * N - The "minor" (i.e., across the group) stripe number - * N = H / U - * - * C - The component index coresponding to L - * - * C = (H - (N * U)) / stripe_unit + G * D - * [or (L % U) / stripe_unit + G * D] - * - * O - The component offset coresponding to L - * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit - * - * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity - * divide by parity - * LCMdP = lcm(group_width, parity) / parity - * - * R - The parity Rotation stripe - * (Note parity cycle always starts at a group's boundary) - * R = N % LCMdP - * - * I = the first parity device index - * I = (group_width + group_width - R*parity - parity) % group_width - * - * Craid - The component index Rotated - * Craid = (group_width + C - R*parity) % group_width - * (We add the group_width to avoid negative numbers modulo math) - */ -void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - u64 length, struct ore_striping_info *si) -{ - u32 stripe_unit = layout->stripe_unit; - u32 group_width = layout->group_width; - u64 group_depth = layout->group_depth; - u32 parity = layout->parity; - - u32 D = group_width - parity; - u32 U = D * stripe_unit; - u64 T = U * group_depth; - u64 S = T * layout->group_count; - u64 M = div64_u64(file_offset, S); - - /* - G = (L - (M * S)) / T - H = (L - (M * S)) % T - */ - u64 LmodS = file_offset - M * S; - u32 G = div64_u64(LmodS, T); - u64 H = LmodS - G * T; - - u32 N = div_u64(H, U); - u32 Nlast; - - /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; - u32 first_dev = C - C % group_width; - - div_u64_rem(file_offset, stripe_unit, &si->unit_off); - - si->obj_offset = si->unit_off + (N * stripe_unit) + - (M * group_depth * stripe_unit); - si->cur_comp = C - first_dev; - si->cur_pg = si->unit_off / PAGE_SIZE; - - if (parity) { - u32 LCMdP = lcm(group_width, parity) / parity; - /* R = N % LCMdP; */ - u32 RxP = (N % LCMdP) * parity; - - si->par_dev = (group_width + group_width - parity - RxP) % - group_width + first_dev; - si->dev = (group_width + group_width + C - RxP) % - group_width + first_dev; - si->bytes_in_stripe = U; - si->first_stripe_start = M * S + G * T + N * U; - } else { - /* Make the math correct see _prepare_one_group */ - si->par_dev = group_width; - si->dev = C; - } - 
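	/*
	 * Worked example of the mapping above, with illustrative numbers
	 * that are not taken from the source: stripe_unit = 64K,
	 * group_width = 5, parity = 1, group_depth = 2, group_count = 2.
	 * Then D = 4, U = 256K, T = 512K, S = 1M. For L = 1.25M:
	 * M = 1 and L % S = 256K, so G = 0, H = 256K and N = 1;
	 * C = (H - N * U) / stripe_unit + G * D = 0 and
	 * O = L % stripe_unit + N * stripe_unit
	 *	+ M * group_depth * stripe_unit = 0 + 64K + 128K = 192K.
	 * With the parity rotation, LCMdP = lcm(5, 1) / 1 = 5 and
	 * R = N % 5 = 1, so Craid = (5 + 0 - 1 * 1) % 5 = 4 and
	 * I = (5 + 5 - 1 - 1) % 5 = 3: file byte 1.25M lands on device 4
	 * at object offset 192K, and the parity unit of its stripe sits
	 * on device 3.
	 */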
- si->dev *= layout->mirrors_p1; - si->par_dev *= layout->mirrors_p1; - si->offset = file_offset; - si->length = T - H; - if (si->length > length) - si->length = length; - - Nlast = div_u64(H + si->length + U - 1, U); - si->maxdevUnits = Nlast - N; - - si->M = M; -} -EXPORT_SYMBOL(ore_calc_stripe_info); - -int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, - unsigned pgbase, struct page **pages, - struct ore_per_dev_state *per_dev, int cur_len) -{ - unsigned pg = *cur_pg; - struct request_queue *q = - osd_request_queue(_ios_od(ios, per_dev->dev)); - unsigned len = cur_len; - int ret; - - if (per_dev->bio == NULL) { - unsigned bio_size; - - if (!ios->reading) { - bio_size = ios->si.maxdevUnits; - } else { - bio_size = (ios->si.maxdevUnits + 1) * - (ios->layout->group_width - ios->layout->parity) / - ios->layout->group_width; - } - bio_size *= (ios->layout->stripe_unit / PAGE_SIZE); - - per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); - if (unlikely(!per_dev->bio)) { - ORE_DBGMSG("Failed to allocate BIO size=%u\n", - bio_size); - ret = -ENOMEM; - goto out; - } - } - - while (cur_len > 0) { - unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); - unsigned added_len; - - cur_len -= pglen; - - added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], - pglen, pgbase); - if (unlikely(pglen != added_len)) { - /* If bi_vcnt == bi_max then this is a SW BUG */ - ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x " - "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n", - per_dev->bio->bi_vcnt, - per_dev->bio->bi_max_vecs, - BIO_MAX_PAGES_KMALLOC, cur_len); - ret = -ENOMEM; - goto out; - } - _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); - - pgbase = 0; - ++pg; - } - BUG_ON(cur_len); - - per_dev->length += len; - *cur_pg = pg; - ret = 0; -out: /* we fail the complete unit on an error eg don't advance - * per_dev->length and cur_pg. This means that we might have a bigger - * bio than the CDB requested length (per_dev->length). That's fine - * only the oposite is fatal. - */ - return ret; -} - -static int _add_parity_units(struct ore_io_state *ios, - struct ore_striping_info *si, - unsigned dev, unsigned first_dev, - unsigned mirrors_p1, unsigned devs_in_group, - unsigned cur_len) -{ - unsigned do_parity; - int ret = 0; - - for (do_parity = ios->layout->parity; do_parity; --do_parity) { - struct ore_per_dev_state *per_dev; - - per_dev = &ios->per_dev[dev - first_dev]; - if (!per_dev->length && !per_dev->offset) { - /* Only/always the parity unit of the first - * stripe will be empty. So this is a chance to - * initialize the per_dev info. 
- */ - per_dev->dev = dev; - per_dev->offset = si->obj_offset - si->unit_off; - } - - ret = _ore_add_parity_unit(ios, si, per_dev, cur_len, - do_parity == 1); - if (unlikely(ret)) - break; - - if (do_parity != 1) { - dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; - si->cur_comp = (si->cur_comp + 1) % - ios->layout->group_width; - } - } - - return ret; -} - -static int _prepare_for_striping(struct ore_io_state *ios) -{ - struct ore_striping_info *si = &ios->si; - unsigned stripe_unit = ios->layout->stripe_unit; - unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned group_width = ios->layout->group_width; - unsigned devs_in_group = group_width * mirrors_p1; - unsigned dev = si->dev; - unsigned first_dev = dev - (dev % devs_in_group); - unsigned cur_pg = ios->pages_consumed; - u64 length = ios->length; - int ret = 0; - - if (!ios->pages) { - ios->numdevs = ios->layout->mirrors_p1; - return 0; - } - - BUG_ON(length > si->length); - - while (length) { - struct ore_per_dev_state *per_dev = - &ios->per_dev[dev - first_dev]; - unsigned cur_len, page_off = 0; - - if (!per_dev->length && !per_dev->offset) { - /* First time initialize the per_dev info. */ - per_dev->dev = dev; - if (dev == si->dev) { - WARN_ON(dev == si->par_dev); - per_dev->offset = si->obj_offset; - cur_len = stripe_unit - si->unit_off; - page_off = si->unit_off & ~PAGE_MASK; - BUG_ON(page_off && (page_off != ios->pgbase)); - } else { - per_dev->offset = si->obj_offset - si->unit_off; - cur_len = stripe_unit; - } - } else { - cur_len = stripe_unit; - } - if (cur_len >= length) - cur_len = length; - - ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, - per_dev, cur_len); - if (unlikely(ret)) - goto out; - - length -= cur_len; - - dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; - si->cur_comp = (si->cur_comp + 1) % group_width; - if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { - if (!length && ios->sp2d) { - /* If we are writing and this is the very last - * stripe. then operate on parity dev. - */ - dev = si->par_dev; - /* If last stripe operate on parity comp */ - si->cur_comp = group_width - ios->layout->parity; - } - - /* In writes cur_len just means if it's the - * last one. See _ore_add_parity_unit. - */ - ret = _add_parity_units(ios, si, dev, first_dev, - mirrors_p1, devs_in_group, - ios->sp2d ? 
length : cur_len); - if (unlikely(ret)) - goto out; - - /* Rotate next par_dev backwards with wraping */ - si->par_dev = (devs_in_group + si->par_dev - - ios->layout->parity * mirrors_p1) % - devs_in_group + first_dev; - /* Next stripe, start fresh */ - si->cur_comp = 0; - si->cur_pg = 0; - si->obj_offset += cur_len; - si->unit_off = 0; - } - } -out: - ios->numdevs = devs_in_group; - ios->pages_consumed = cur_pg; - return ret; -} - -int ore_create(struct ore_io_state *ios) -{ - int i, ret; - - for (i = 0; i < ios->oc->numdevs; i++) { - struct osd_request *or; - - or = osd_start_request(_ios_od(ios, i)); - if (unlikely(!or)) { - ORE_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - ios->per_dev[i].or = or; - ios->numdevs++; - - osd_req_create_object(or, _ios_obj(ios, i)); - } - ret = ore_io_execute(ios); - -out: - return ret; -} -EXPORT_SYMBOL(ore_create); - -int ore_remove(struct ore_io_state *ios) -{ - int i, ret; - - for (i = 0; i < ios->oc->numdevs; i++) { - struct osd_request *or; - - or = osd_start_request(_ios_od(ios, i)); - if (unlikely(!or)) { - ORE_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - ios->per_dev[i].or = or; - ios->numdevs++; - - osd_req_remove_object(or, _ios_obj(ios, i)); - } - ret = ore_io_execute(ios); - -out: - return ret; -} -EXPORT_SYMBOL(ore_remove); - -static int _write_mirror(struct ore_io_state *ios, int cur_comp) -{ - struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp]; - unsigned dev = ios->per_dev[cur_comp].dev; - unsigned last_comp = cur_comp + ios->layout->mirrors_p1; - int ret = 0; - - if (ios->pages && !master_dev->length) - return 0; /* Just an empty slot */ - - for (; cur_comp < last_comp; ++cur_comp, ++dev) { - struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; - struct osd_request *or; - - or = osd_start_request(_ios_od(ios, dev)); - if (unlikely(!or)) { - ORE_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - per_dev->or = or; - - if (ios->pages) { - struct bio *bio; - - if (per_dev != master_dev) { - bio = bio_clone_fast(master_dev->bio, - GFP_KERNEL, NULL); - if (unlikely(!bio)) { - ORE_DBGMSG( - "Failed to allocate BIO size=%u\n", - master_dev->bio->bi_max_vecs); - ret = -ENOMEM; - goto out; - } - - bio->bi_disk = NULL; - bio->bi_next = NULL; - per_dev->offset = master_dev->offset; - per_dev->length = master_dev->length; - per_dev->bio = bio; - per_dev->dev = dev; - } else { - bio = master_dev->bio; - /* FIXME: bio_set_dir() */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - } - - osd_req_write(or, _ios_obj(ios, cur_comp), - per_dev->offset, bio, per_dev->length); - ORE_DBGMSG("write(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d\n", - _LLU(_ios_obj(ios, cur_comp)->id), - _LLU(per_dev->offset), - _LLU(per_dev->length), dev); - } else if (ios->kern_buff) { - per_dev->offset = ios->si.obj_offset; - per_dev->dev = ios->si.dev + dev; - - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (ios->si.unit_off + ios->length > - ios->layout->stripe_unit)); - - ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp), - per_dev->offset, - ios->kern_buff, ios->length); - if (unlikely(ret)) - goto out; - ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d\n", - _LLU(_ios_obj(ios, cur_comp)->id), - _LLU(per_dev->offset), - _LLU(ios->length), per_dev->dev); - } else { - osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); - ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", - 
_LLU(_ios_obj(ios, cur_comp)->id), - ios->out_attr_len, dev); - } - - if (ios->out_attr) - osd_req_add_set_attr_list(or, ios->out_attr, - ios->out_attr_len); - - if (ios->in_attr) - osd_req_add_get_attr_list(or, ios->in_attr, - ios->in_attr_len); - } - -out: - return ret; -} - -int ore_write(struct ore_io_state *ios) -{ - int i; - int ret; - - if (unlikely(ios->sp2d && !ios->r4w)) { - /* A library is attempting a RAID-write without providing - * a pages lock interface. - */ - WARN_ON_ONCE(1); - return -ENOTSUPP; - } - - ret = _prepare_for_striping(ios); - if (unlikely(ret)) - return ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _write_mirror(ios, i); - if (unlikely(ret)) - return ret; - } - - ret = ore_io_execute(ios); - return ret; -} -EXPORT_SYMBOL(ore_write); - -int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) -{ - struct osd_request *or; - struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; - struct osd_obj_id *obj = _ios_obj(ios, cur_comp); - unsigned first_dev = (unsigned)obj->id; - - if (ios->pages && !per_dev->length) - return 0; /* Just an empty slot */ - - first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; - or = osd_start_request(_ios_od(ios, first_dev)); - if (unlikely(!or)) { - ORE_ERR("%s: osd_start_request failed\n", __func__); - return -ENOMEM; - } - per_dev->or = or; - - if (ios->pages) { - if (per_dev->cur_sg) { - /* finalize the last sg_entry */ - _ore_add_sg_seg(per_dev, 0, false); - if (unlikely(!per_dev->cur_sg)) - return 0; /* Skip parity only device */ - - osd_req_read_sg(or, obj, per_dev->bio, - per_dev->sglist, per_dev->cur_sg); - } else { - /* The no raid case */ - osd_req_read(or, obj, per_dev->offset, - per_dev->bio, per_dev->length); - } - - ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" - " dev=%d sg_len=%d\n", _LLU(obj->id), - _LLU(per_dev->offset), _LLU(per_dev->length), - first_dev, per_dev->cur_sg); - } else { - BUG_ON(ios->kern_buff); - - osd_req_get_attributes(or, obj); - ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", - _LLU(obj->id), - ios->in_attr_len, first_dev); - } - if (ios->out_attr) - osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); - - if (ios->in_attr) - osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); - - return 0; -} - -int ore_read(struct ore_io_state *ios) -{ - int i; - int ret; - - ret = _prepare_for_striping(ios); - if (unlikely(ret)) - return ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _ore_read_mirror(ios, i); - if (unlikely(ret)) - return ret; - } - - ret = ore_io_execute(ios); - return ret; -} -EXPORT_SYMBOL(ore_read); - -int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr) -{ - struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ - void *iter = NULL; - int nelem; - - do { - nelem = 1; - osd_req_decode_get_attr_list(ios->per_dev[0].or, - &cur_attr, &nelem, &iter); - if ((cur_attr.attr_page == attr->attr_page) && - (cur_attr.attr_id == attr->attr_id)) { - attr->len = cur_attr.len; - attr->val_ptr = cur_attr.val_ptr; - return 0; - } - } while (iter); - - return -EIO; -} -EXPORT_SYMBOL(extract_attr_from_ios); - -static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, - struct osd_attr *attr) -{ - int last_comp = cur_comp + ios->layout->mirrors_p1; - - for (; cur_comp < last_comp; ++cur_comp) { - struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; - struct osd_request *or; - - or = osd_start_request(_ios_od(ios, cur_comp)); - 
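		/*
		 * (Annotation, not in the original: as in _write_mirror()
		 * above, every mirror slot gets its own osd_request, and a
		 * NULL return from osd_start_request() aborts the whole
		 * truncate with -ENOMEM; the same logical-length attribute
		 * is then queued to each mirror below.)
		 */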
if (unlikely(!or)) { - ORE_ERR("%s: osd_start_request failed\n", __func__); - return -ENOMEM; - } - per_dev->or = or; - - osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); - osd_req_add_set_attr_list(or, attr, 1); - } - - return 0; -} - -struct _trunc_info { - struct ore_striping_info si; - u64 prev_group_obj_off; - u64 next_group_obj_off; - - unsigned first_group_dev; - unsigned nex_group_dev; -}; - -static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, - struct _trunc_info *ti) -{ - unsigned stripe_unit = layout->stripe_unit; - - ore_calc_stripe_info(layout, file_offset, 0, &ti->si); - - ti->prev_group_obj_off = ti->si.M * stripe_unit; - ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; - - ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); - ti->nex_group_dev = ti->first_group_dev + layout->group_width; -} - -int ore_truncate(struct ore_layout *layout, struct ore_components *oc, - u64 size) -{ - struct ore_io_state *ios; - struct exofs_trunc_attr { - struct osd_attr attr; - __be64 newsize; - } *size_attrs; - struct _trunc_info ti; - int i, ret; - - ret = ore_get_io_state(layout, oc, &ios); - if (unlikely(ret)) - return ret; - - _calc_trunk_info(ios->layout, size, &ti); - - size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), - GFP_KERNEL); - if (unlikely(!size_attrs)) { - ret = -ENOMEM; - goto out; - } - - ios->numdevs = ios->oc->numdevs; - - for (i = 0; i < ios->numdevs; ++i) { - struct exofs_trunc_attr *size_attr = &size_attrs[i]; - u64 obj_size; - - if (i < ti.first_group_dev) - obj_size = ti.prev_group_obj_off; - else if (i >= ti.nex_group_dev) - obj_size = ti.next_group_obj_off; - else if (i < ti.si.dev) /* dev within this group */ - obj_size = ti.si.obj_offset + - ios->layout->stripe_unit - ti.si.unit_off; - else if (i == ti.si.dev) - obj_size = ti.si.obj_offset; - else /* i > ti.dev */ - obj_size = ti.si.obj_offset - ti.si.unit_off; - - size_attr->newsize = cpu_to_be64(obj_size); - size_attr->attr = g_attr_logical_length; - size_attr->attr.val_ptr = &size_attr->newsize; - - ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", - _LLU(oc->comps->obj.id), _LLU(obj_size), i); - ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, - &size_attr->attr); - if (unlikely(ret)) - goto out; - } - ret = ore_io_execute(ios); - -out: - kfree(size_attrs); - ore_put_io_state(ios); - return ret; -} -EXPORT_SYMBOL(ore_truncate); - -const struct osd_attr g_attr_logical_length = ATTR_DEF( - OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); -EXPORT_SYMBOL(g_attr_logical_length); diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c deleted file mode 100644 index e83bab54b03e..000000000000 --- a/fs/exofs/ore_raid.c +++ /dev/null @@ -1,757 +0,0 @@ -/* - * Copyright (C) 2011 - * Boaz Harrosh <ooo@electrozaur.com> - * - * This file is part of the objects raid engine (ore). - * - * It is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * You should have received a copy of the GNU General Public License - * along with "ore". 
If not, write to the Free Software Foundation, Inc: - * "Free Software Foundation <info@fsf.org>" - */ - -#include <linux/gfp.h> -#include <linux/async_tx.h> - -#include "ore_raid.h" - -#undef ORE_DBGMSG2 -#define ORE_DBGMSG2 ORE_DBGMSG - -static struct page *_raid_page_alloc(void) -{ - return alloc_page(GFP_KERNEL); -} - -static void _raid_page_free(struct page *p) -{ - __free_page(p); -} - -/* This struct is forward declare in ore_io_state, but is private to here. - * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. - * - * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. - * Ascending page index access is sp2d(p-minor, c-major). But storage is - * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor - * API. - */ -struct __stripe_pages_2d { - /* Cache some hot path repeated calculations */ - unsigned parity; - unsigned data_devs; - unsigned pages_in_unit; - - bool needed ; - - /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ - struct __1_page_stripe { - bool alloc; - unsigned write_count; - struct async_submit_ctl submit; - struct dma_async_tx_descriptor *tx; - - /* The size of this array is data_devs + parity */ - struct page **pages; - struct page **scribble; - /* bool array, size of this array is data_devs */ - char *page_is_read; - } _1p_stripes[]; -}; - -/* This can get bigger then a page. So support multiple page allocations - * _sp2d_free should be called even if _sp2d_alloc fails (by returning - * none-zero). - */ -static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, - unsigned parity, struct __stripe_pages_2d **psp2d) -{ - struct __stripe_pages_2d *sp2d; - unsigned data_devs = group_width - parity; - - /* - * Desired allocation layout is, though when larger than PAGE_SIZE, - * each struct __alloc_1p_arrays is separately allocated: - - struct _alloc_all_bytes { - struct __alloc_stripe_pages_2d { - struct __stripe_pages_2d sp2d; - struct __1_page_stripe _1p_stripes[pages_in_unit]; - } __asp2d; - struct __alloc_1p_arrays { - struct page *pages[group_width]; - struct page *scribble[group_width]; - char page_is_read[data_devs]; - } __a1pa[pages_in_unit]; - } *_aab; - - struct __alloc_1p_arrays *__a1pa; - struct __alloc_1p_arrays *__a1pa_end; - - */ - - char *__a1pa; - char *__a1pa_end; - - const size_t sizeof_stripe_pages_2d = - sizeof(struct __stripe_pages_2d) + - sizeof(struct __1_page_stripe) * pages_in_unit; - const size_t sizeof__a1pa = - ALIGN(sizeof(struct page *) * (2 * group_width) + data_devs, - sizeof(void *)); - const size_t sizeof__a1pa_arrays = sizeof__a1pa * pages_in_unit; - const size_t alloc_total = sizeof_stripe_pages_2d + - sizeof__a1pa_arrays; - - unsigned num_a1pa, alloc_size, i; - - /* FIXME: check these numbers in ore_verify_layout */ - BUG_ON(sizeof_stripe_pages_2d > PAGE_SIZE); - BUG_ON(sizeof__a1pa > PAGE_SIZE); - - /* - * If alloc_total would be larger than PAGE_SIZE, only allocate - * as many a1pa items as would fill the rest of the page, instead - * of the full pages_in_unit count. - */ - if (alloc_total > PAGE_SIZE) { - num_a1pa = (PAGE_SIZE - sizeof_stripe_pages_2d) / sizeof__a1pa; - alloc_size = sizeof_stripe_pages_2d + sizeof__a1pa * num_a1pa; - } else { - num_a1pa = pages_in_unit; - alloc_size = alloc_total; - } - - *psp2d = sp2d = kzalloc(alloc_size, GFP_KERNEL); - if (unlikely(!sp2d)) { - ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); - return -ENOMEM; - } - /* From here Just call _sp2d_free */ - - /* Find start of a1pa area. 
*/ - __a1pa = (char *)sp2d + sizeof_stripe_pages_2d; - /* Find end of the _allocated_ a1pa area. */ - __a1pa_end = __a1pa + alloc_size; - - /* Allocate additionally needed a1pa items in PAGE_SIZE chunks. */ - for (i = 0; i < pages_in_unit; ++i) { - struct __1_page_stripe *stripe = &sp2d->_1p_stripes[i]; - - if (unlikely(__a1pa >= __a1pa_end)) { - num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, - pages_in_unit - i); - alloc_size = sizeof__a1pa * num_a1pa; - - __a1pa = kzalloc(alloc_size, GFP_KERNEL); - if (unlikely(!__a1pa)) { - ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", - num_a1pa); - return -ENOMEM; - } - __a1pa_end = __a1pa + alloc_size; - /* First *pages is marked for kfree of the buffer */ - stripe->alloc = true; - } - - /* - * Attach all _lp_stripes pointers to the allocation for - * it which was either part of the original PAGE_SIZE - * allocation or the subsequent allocation in this loop. - */ - stripe->pages = (void *)__a1pa; - stripe->scribble = stripe->pages + group_width; - stripe->page_is_read = (char *)stripe->scribble + group_width; - __a1pa += sizeof__a1pa; - } - - sp2d->parity = parity; - sp2d->data_devs = data_devs; - sp2d->pages_in_unit = pages_in_unit; - return 0; -} - -static void _sp2d_reset(struct __stripe_pages_2d *sp2d, - const struct _ore_r4w_op *r4w, void *priv) -{ - unsigned data_devs = sp2d->data_devs; - unsigned group_width = data_devs + sp2d->parity; - int p, c; - - if (!sp2d->needed) - return; - - for (c = data_devs - 1; c >= 0; --c) - for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - if (_1ps->page_is_read[c]) { - struct page *page = _1ps->pages[c]; - - r4w->put_page(priv, page); - _1ps->page_is_read[c] = false; - } - } - - for (p = 0; p < sp2d->pages_in_unit; p++) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); - _1ps->write_count = 0; - _1ps->tx = NULL; - } - - sp2d->needed = false; -} - -static void _sp2d_free(struct __stripe_pages_2d *sp2d) -{ - unsigned i; - - if (!sp2d) - return; - - for (i = 0; i < sp2d->pages_in_unit; ++i) { - if (sp2d->_1p_stripes[i].alloc) - kfree(sp2d->_1p_stripes[i].pages); - } - - kfree(sp2d); -} - -static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) -{ - unsigned p; - - for (p = 0; p < sp2d->pages_in_unit; p++) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - if (_1ps->write_count) - return p; - } - - return ~0; -} - -static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) -{ - int p; - - for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - if (_1ps->write_count) - return p; - } - - return ~0; -} - -static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) -{ - unsigned p; - unsigned tx_flags = ASYNC_TX_ACK; - - if (sp2d->parity == 1) - tx_flags |= ASYNC_TX_XOR_ZERO_DST; - - for (p = 0; p < sp2d->pages_in_unit; p++) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - if (!_1ps->write_count) - continue; - - init_async_submit(&_1ps->submit, tx_flags, - NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); - - if (sp2d->parity == 1) - _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], - _1ps->pages, 0, sp2d->data_devs, - PAGE_SIZE, &_1ps->submit); - else /* parity == 2 */ - _1ps->tx = async_gen_syndrome(_1ps->pages, 0, - sp2d->data_devs + sp2d->parity, - PAGE_SIZE, &_1ps->submit); - } - - for (p = 0; p < sp2d->pages_in_unit; p++) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - /* NOTE: We 
wait for HW synchronously (I don't have such HW - * to test with.) Is parallelism needed with today's multi - * cores? - */ - async_tx_issue_pending(_1ps->tx); - } -} - -void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, - struct ore_striping_info *si, struct page *page) -{ - struct __1_page_stripe *_1ps; - - sp2d->needed = true; - - _1ps = &sp2d->_1p_stripes[si->cur_pg]; - _1ps->pages[si->cur_comp] = page; - ++_1ps->write_count; - - si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; - /* si->cur_comp is advanced outside at main loop */ -} - -void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, - bool not_last) -{ - struct osd_sg_entry *sge; - - ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " - "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", - per_dev->dev, cur_len, not_last, per_dev->cur_sg, - _LLU(per_dev->offset), per_dev->length, - per_dev->last_sgs_total); - - if (!per_dev->cur_sg) { - sge = per_dev->sglist; - - /* First time we prepare two entries */ - if (per_dev->length) { - ++per_dev->cur_sg; - sge->offset = per_dev->offset; - sge->len = per_dev->length; - } else { - /* Here the parity is the first unit of this object. - * This happens every time we reach a parity device on - * the same stripe as the per_dev->offset. We need to - * just skip this unit. - */ - per_dev->offset += cur_len; - return; - } - } else { - /* finalize the last one */ - sge = &per_dev->sglist[per_dev->cur_sg - 1]; - sge->len = per_dev->length - per_dev->last_sgs_total; - } - - if (not_last) { - /* Partly prepare the next one */ - struct osd_sg_entry *next_sge = sge + 1; - - ++per_dev->cur_sg; - next_sge->offset = sge->offset + sge->len + cur_len; - /* Save cur len so we know how mutch was added next time */ - per_dev->last_sgs_total = per_dev->length; - next_sge->len = 0; - } else if (!sge->len) { - /* Optimize for when the last unit is a parity */ - --per_dev->cur_sg; - } -} - -static int _alloc_read_4_write(struct ore_io_state *ios) -{ - struct ore_layout *layout = ios->layout; - int ret; - /* We want to only read those pages not in cache so worst case - * is a stripe populated with every other page - */ - unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; - - ret = _ore_get_io_state(layout, ios->oc, - layout->group_width * layout->mirrors_p1, - sgs_per_dev, 0, &ios->ios_read_4_write); - return ret; -} - -/* @si contains info of the to-be-inserted page. Update of @si should be - * maintained by caller. Specificaly si->dev, si->obj_offset, ... 
- */ -static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si, - struct page *page, unsigned pg_len) -{ - struct request_queue *q; - struct ore_per_dev_state *per_dev; - struct ore_io_state *read_ios; - unsigned first_dev = si->dev - (si->dev % - (ios->layout->group_width * ios->layout->mirrors_p1)); - unsigned comp = si->dev - first_dev; - unsigned added_len; - - if (!ios->ios_read_4_write) { - int ret = _alloc_read_4_write(ios); - - if (unlikely(ret)) - return ret; - } - - read_ios = ios->ios_read_4_write; - read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; - - per_dev = &read_ios->per_dev[comp]; - if (!per_dev->length) { - per_dev->bio = bio_kmalloc(GFP_KERNEL, - ios->sp2d->pages_in_unit); - if (unlikely(!per_dev->bio)) { - ORE_DBGMSG("Failed to allocate BIO size=%u\n", - ios->sp2d->pages_in_unit); - return -ENOMEM; - } - per_dev->offset = si->obj_offset; - per_dev->dev = si->dev; - } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { - u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); - - _ore_add_sg_seg(per_dev, gap, true); - } - q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); - added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len, - si->obj_offset % PAGE_SIZE); - if (unlikely(added_len != pg_len)) { - ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", - per_dev->bio->bi_vcnt); - return -ENOMEM; - } - - per_dev->length += pg_len; - return 0; -} - -/* read the beginning of an unaligned first page */ -static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page) -{ - struct ore_striping_info si; - unsigned pg_len; - - ore_calc_stripe_info(ios->layout, ios->offset, 0, &si); - - pg_len = si.obj_offset % PAGE_SIZE; - si.obj_offset -= pg_len; - - ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n", - _LLU(si.obj_offset), pg_len, page->index, si.dev); - - return _add_to_r4w(ios, &si, page, pg_len); -} - -/* read the end of an incomplete last page */ -static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) -{ - struct ore_striping_info si; - struct page *page; - unsigned pg_len, p, c; - - ore_calc_stripe_info(ios->layout, *offset, 0, &si); - - p = si.cur_pg; - c = si.cur_comp; - page = ios->sp2d->_1p_stripes[p].pages[c]; - - pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); - *offset += pg_len; - - ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n", - p, c, _LLU(*offset), pg_len, si.dev, si.par_dev); - - BUG_ON(!page); - - return _add_to_r4w(ios, &si, page, pg_len); -} - -static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) -{ - struct bio_vec *bv; - unsigned i, d; - - /* loop on all devices all pages */ - for (d = 0; d < ios->numdevs; d++) { - struct bio *bio = ios->per_dev[d].bio; - struct bvec_iter_all iter_all; - - if (!bio) - continue; - - bio_for_each_segment_all(bv, bio, i, iter_all) { - struct page *page = bv->bv_page; - - SetPageUptodate(page); - if (PageError(page)) - ClearPageError(page); - } - } -} - -/* read_4_write is hacked to read the start of the first stripe and/or - * the end of the last stripe. If needed, with an sg-gap at each device/page. - * It is assumed to be called after the to_be_written pages of the first stripe - * are populating ios->sp2d[][] - * - * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations - * These pages are held at sp2d[p].pages[c] but with - * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are - * ios->r4w->lock_fn(). 
The ios->r4w->lock_fn might signal that the page is - * @uptodate=true, so we don't need to read it, only unlock, after IO. - * - * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then - * to-be-written count, we should consider the xor-in-place mode. - * need_to_read_pages_count is the actual number of pages not present in cache. - * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough - * approximation? In this mode the read pages are put in the empty places of - * ios->sp2d[p][*], xor is calculated the same way. These pages are - * allocated/freed and don't go through cache - */ -static int _read_4_write_first_stripe(struct ore_io_state *ios) -{ - struct ore_striping_info read_si; - struct __stripe_pages_2d *sp2d = ios->sp2d; - u64 offset = ios->si.first_stripe_start; - unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; - - if (offset == ios->offset) /* Go to start collect $200 */ - goto read_last_stripe; - - min_p = _sp2d_min_pg(sp2d); - max_p = _sp2d_max_pg(sp2d); - - ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n", - offset, ios->offset, min_p, max_p); - - for (c = 0; ; c++) { - ore_calc_stripe_info(ios->layout, offset, 0, &read_si); - read_si.obj_offset += min_p * PAGE_SIZE; - offset += min_p * PAGE_SIZE; - for (p = min_p; p <= max_p; p++) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - struct page **pp = &_1ps->pages[c]; - bool uptodate; - - if (*pp) { - if (ios->offset % PAGE_SIZE) - /* Read the remainder of the page */ - _add_to_r4w_first_page(ios, *pp); - /* to-be-written pages start here */ - goto read_last_stripe; - } - - *pp = ios->r4w->get_page(ios->private, offset, - &uptodate); - if (unlikely(!*pp)) - return -ENOMEM; - - if (!uptodate) - _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE); - - /* Mark read-pages to be cache_released */ - _1ps->page_is_read[c] = true; - read_si.obj_offset += PAGE_SIZE; - offset += PAGE_SIZE; - } - offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; - } - -read_last_stripe: - return 0; -} - -static int _read_4_write_last_stripe(struct ore_io_state *ios) -{ - struct ore_striping_info read_si; - struct __stripe_pages_2d *sp2d = ios->sp2d; - u64 offset; - u64 last_stripe_end; - unsigned bytes_in_stripe = ios->si.bytes_in_stripe; - unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; - - offset = ios->offset + ios->length; - if (offset % PAGE_SIZE) - _add_to_r4w_last_page(ios, &offset); - /* offset will be aligned to next page */ - - last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) - * bytes_in_stripe; - if (offset == last_stripe_end) /* Optimize for the aligned case */ - goto read_it; - - ore_calc_stripe_info(ios->layout, offset, 0, &read_si); - p = read_si.cur_pg; - c = read_si.cur_comp; - - if (min_p == sp2d->pages_in_unit) { - /* Didn't do it yet */ - min_p = _sp2d_min_pg(sp2d); - max_p = _sp2d_max_pg(sp2d); - } - - ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n", - offset, last_stripe_end, min_p, max_p); - - while (offset < last_stripe_end) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - if ((min_p <= p) && (p <= max_p)) { - struct page *page; - bool uptodate; - - BUG_ON(_1ps->pages[c]); - page = ios->r4w->get_page(ios->private, offset, - &uptodate); - if (unlikely(!page)) - return -ENOMEM; - - _1ps->pages[c] = page; - /* Mark read-pages to be cache_released */ - _1ps->page_is_read[c] = true; - if (!uptodate) - _add_to_r4w(ios, &read_si, page, PAGE_SIZE); - } - - offset += PAGE_SIZE; - if (p == (sp2d->pages_in_unit - 
1)) { - ++c; - p = 0; - ore_calc_stripe_info(ios->layout, offset, 0, &read_si); - } else { - read_si.obj_offset += PAGE_SIZE; - ++p; - } - } - -read_it: - return 0; -} - -static int _read_4_write_execute(struct ore_io_state *ios) -{ - struct ore_io_state *ios_read; - unsigned i; - int ret; - - ios_read = ios->ios_read_4_write; - if (!ios_read) - return 0; - - /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change - * to check for per_dev->bio - */ - ios_read->pages = ios->pages; - - /* Now read these devices */ - for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { - ret = _ore_read_mirror(ios_read, i); - if (unlikely(ret)) - return ret; - } - - ret = ore_io_execute(ios_read); /* Synchronus execution */ - if (unlikely(ret)) { - ORE_DBGMSG("!! ore_io_execute => %d\n", ret); - return ret; - } - - _mark_read4write_pages_uptodate(ios_read, ret); - ore_put_io_state(ios_read); - ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */ - return 0; -} - -/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ -int _ore_add_parity_unit(struct ore_io_state *ios, - struct ore_striping_info *si, - struct ore_per_dev_state *per_dev, - unsigned cur_len, bool do_xor) -{ - if (ios->reading) { - if (per_dev->cur_sg >= ios->sgs_per_dev) { - ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" , - per_dev->cur_sg, ios->sgs_per_dev); - return -ENOMEM; - } - _ore_add_sg_seg(per_dev, cur_len, true); - } else { - struct __stripe_pages_2d *sp2d = ios->sp2d; - struct page **pages = ios->parity_pages + ios->cur_par_page; - unsigned num_pages; - unsigned array_start = 0; - unsigned i; - int ret; - - si->cur_pg = _sp2d_min_pg(sp2d); - num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; - - if (!per_dev->length) { - per_dev->offset += si->cur_pg * PAGE_SIZE; - /* If first stripe, Read in all read4write pages - * (if needed) before we calculate the first parity. 
- */ - if (do_xor) - _read_4_write_first_stripe(ios); - } - if (!cur_len && do_xor) - /* If last stripe r4w pages of last stripe */ - _read_4_write_last_stripe(ios); - _read_4_write_execute(ios); - - for (i = 0; i < num_pages; i++) { - pages[i] = _raid_page_alloc(); - if (unlikely(!pages[i])) - return -ENOMEM; - - ++(ios->cur_par_page); - } - - BUG_ON(si->cur_comp < sp2d->data_devs); - BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); - - ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, - per_dev, num_pages * PAGE_SIZE); - if (unlikely(ret)) - return ret; - - if (do_xor) { - _gen_xor_unit(sp2d); - _sp2d_reset(sp2d, ios->r4w, ios->private); - } - } - return 0; -} - -int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) -{ - if (ios->parity_pages) { - struct ore_layout *layout = ios->layout; - unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; - - if (_sp2d_alloc(pages_in_unit, layout->group_width, - layout->parity, &ios->sp2d)) { - return -ENOMEM; - } - } - return 0; -} - -void _ore_free_raid_stuff(struct ore_io_state *ios) -{ - if (ios->sp2d) { /* writing and raid */ - unsigned i; - - for (i = 0; i < ios->cur_par_page; i++) { - struct page *page = ios->parity_pages[i]; - - if (page) - _raid_page_free(page); - } - if (ios->extra_part_alloc) - kfree(ios->parity_pages); - /* If IO returned an error pages might need unlocking */ - _sp2d_reset(ios->sp2d, ios->r4w, ios->private); - _sp2d_free(ios->sp2d); - } else { - /* Will only be set if raid reading && sglist is big */ - if (ios->extra_part_alloc) - kfree(ios->per_dev[0].sglist); - } - if (ios->ios_read_4_write) - ore_put_io_state(ios->ios_read_4_write); -} diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h deleted file mode 100644 index a6e746775570..000000000000 --- a/fs/exofs/ore_raid.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (C) from 2011 - * Boaz Harrosh <ooo@electrozaur.com> - * - * This file is part of the objects raid engine (ore). - * - * It is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * You should have received a copy of the GNU General Public License - * along with "ore". If not, write to the Free Software Foundation, Inc: - * "Free Software Foundation <info@fsf.org>" - */ - -#include <scsi/osd_ore.h> - -#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) - -#ifdef CONFIG_EXOFS_DEBUG -#define ORE_DBGMSG(fmt, a...) \ - printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) -#else -#define ORE_DBGMSG(fmt, a...) \ - do { if (0) printk(fmt, ##a); } while (0) -#endif - -/* u64 has problems with printk this will cast it to unsigned long long */ -#define _LLU(x) (unsigned long long)(x) - -#define ORE_DBGMSG2(M...) 
do {} while (0) -/* #define ORE_DBGMSG2 ORE_DBGMSG */ - -/* ios_raid.c stuff needed by ios.c */ -int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); -void _ore_free_raid_stuff(struct ore_io_state *ios); - -void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, - bool not_last); -int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, - struct ore_per_dev_state *per_dev, unsigned cur_len, - bool do_xor); -void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, - struct ore_striping_info *si, struct page *page); -static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, - struct ore_striping_info *si, struct page *page) -{ - if (!sp2d) /* Inline the fast path */ - return; /* Hay no raid stuff */ - _ore_add_stripe_page(sp2d, si, page); -} - -/* ios.c stuff needed by ios_raid.c */ -int _ore_get_io_state(struct ore_layout *layout, - struct ore_components *oc, unsigned numdevs, - unsigned sgs_per_dev, unsigned num_par_pages, - struct ore_io_state **pios); -int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, - unsigned pgbase, struct page **pages, - struct ore_per_dev_state *per_dev, int cur_len); -int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp); -int ore_io_execute(struct ore_io_state *ios); diff --git a/fs/exofs/super.c b/fs/exofs/super.c deleted file mode 100644 index fc80c7233fa5..000000000000 --- a/fs/exofs/super.c +++ /dev/null @@ -1,1071 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <linux/string.h> -#include <linux/parser.h> -#include <linux/vfs.h> -#include <linux/random.h> -#include <linux/module.h> -#include <linux/exportfs.h> -#include <linux/slab.h> -#include <linux/iversion.h> - -#include "exofs.h" - -#define EXOFS_DBGMSG2(M...) do {} while (0) - -/****************************************************************************** - * MOUNT OPTIONS - *****************************************************************************/ - -/* - * struct to hold what we get from mount options - */ -struct exofs_mountopt { - bool is_osdname; - const char *dev_name; - uint64_t pid; - int timeout; -}; - -/* - * exofs-specific mount-time options. - */ -enum { Opt_name, Opt_pid, Opt_to, Opt_err }; - -/* - * Our mount-time options. 
These should ideally be 64-bit unsigned, but the - * kernel's parsing functions do not currently support that. 32-bit should be - * sufficient for most applications now. - */ -static match_table_t tokens = { - {Opt_name, "osdname=%s"}, - {Opt_pid, "pid=%u"}, - {Opt_to, "to=%u"}, - {Opt_err, NULL} -}; - -/* - * The main option parsing method. Also makes sure that all of the mandatory - * mount options were set. - */ -static int parse_options(char *options, struct exofs_mountopt *opts) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - bool s_pid = false; - - EXOFS_DBGMSG("parse_options %s\n", options); - /* defaults */ - memset(opts, 0, sizeof(*opts)); - opts->timeout = BLK_DEFAULT_SG_TIMEOUT; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - char str[32]; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_name: - kfree(opts->dev_name); - opts->dev_name = match_strdup(&args[0]); - if (unlikely(!opts->dev_name)) { - EXOFS_ERR("Error allocating dev_name"); - return -ENOMEM; - } - opts->is_osdname = true; - break; - case Opt_pid: - if (0 == match_strlcpy(str, &args[0], sizeof(str))) - return -EINVAL; - opts->pid = simple_strtoull(str, NULL, 0); - if (opts->pid < EXOFS_MIN_PID) { - EXOFS_ERR("Partition ID must be >= %u", - EXOFS_MIN_PID); - return -EINVAL; - } - s_pid = true; - break; - case Opt_to: - if (match_int(&args[0], &option)) - return -EINVAL; - if (option <= 0) { - EXOFS_ERR("Timeout must be > 0"); - return -EINVAL; - } - opts->timeout = option * HZ; - break; - } - } - - if (!s_pid) { - EXOFS_ERR("Need to specify the following options:\n"); - EXOFS_ERR(" -o pid=pid_no_to_use\n"); - return -EINVAL; - } - - return 0; -} - -/****************************************************************************** - * INODE CACHE - *****************************************************************************/ - -/* - * Our inode cache. Isn't it pretty? - */ -static struct kmem_cache *exofs_inode_cachep; - -/* - * Allocate an inode in the cache - */ -static struct inode *exofs_alloc_inode(struct super_block *sb) -{ - struct exofs_i_info *oi; - - oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL); - if (!oi) - return NULL; - - inode_set_iversion(&oi->vfs_inode, 1); - return &oi->vfs_inode; -} - -static void exofs_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); -} - -/* - * Remove an inode from the cache - */ -static void exofs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, exofs_i_callback); -} - -/* - * Initialize the inode - */ -static void exofs_init_once(void *foo) -{ - struct exofs_i_info *oi = foo; - - inode_init_once(&oi->vfs_inode); -} - -/* - * Create and initialize the inode cache - */ -static int init_inodecache(void) -{ - exofs_inode_cachep = kmem_cache_create_usercopy("exofs_inode_cache", - sizeof(struct exofs_i_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | - SLAB_ACCOUNT, - offsetof(struct exofs_i_info, i_data), - sizeof_field(struct exofs_i_info, i_data), - exofs_init_once); - if (exofs_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -/* - * Destroy the inode cache - */ -static void destroy_inodecache(void) -{ - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. 
- */ - rcu_barrier(); - kmem_cache_destroy(exofs_inode_cachep); -} - -/****************************************************************************** - * Some osd helpers - *****************************************************************************/ -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) -{ - osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); -} - -static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, - u64 offset, void *p, unsigned length) -{ - struct osd_request *or = osd_start_request(od); -/* struct osd_sense_info osi = {.key = 0};*/ - int ret; - - if (unlikely(!or)) { - EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); - return -ENOMEM; - } - ret = osd_req_read_kern(or, obj, offset, p, length); - if (unlikely(ret)) { - EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); - goto out; - } - - ret = osd_finalize_request(or, 0, cred, NULL); - if (unlikely(ret)) { - EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); - goto out; - } - - ret = osd_execute_request(or); - if (unlikely(ret)) - EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); - /* osd_req_decode_sense(or, ret); */ - -out: - osd_end_request(or); - EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%p ret=>%d\n", - _LLU(obj->id), _LLU(offset), _LLU(length), od, ret); - return ret; -} - -static const struct osd_attr g_attr_sb_stats = ATTR_DEF( - EXOFS_APAGE_SB_DATA, - EXOFS_ATTR_SB_STATS, - sizeof(struct exofs_sb_stats)); - -static int __sbi_read_stats(struct exofs_sb_info *sbi) -{ - struct osd_attr attrs[] = { - [0] = g_attr_sb_stats, - }; - struct ore_io_state *ios; - int ret; - - ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); - if (unlikely(ret)) { - EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); - return ret; - } - - ios->in_attr = attrs; - ios->in_attr_len = ARRAY_SIZE(attrs); - - ret = ore_read(ios); - if (unlikely(ret)) { - EXOFS_ERR("Error reading super_block stats => %d\n", ret); - goto out; - } - - ret = extract_attr_from_ios(ios, &attrs[0]); - if (ret) { - EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__); - goto out; - } - if (attrs[0].len) { - struct exofs_sb_stats *ess; - - if (unlikely(attrs[0].len != sizeof(*ess))) { - EXOFS_ERR("%s: Wrong version of exofs_sb_stats " - "size(%d) != expected(%zd)\n", - __func__, attrs[0].len, sizeof(*ess)); - goto out; - } - - ess = attrs[0].val_ptr; - sbi->s_nextid = le64_to_cpu(ess->s_nextid); - sbi->s_numfiles = le32_to_cpu(ess->s_numfiles); - } - -out: - ore_put_io_state(ios); - return ret; -} - -static void stats_done(struct ore_io_state *ios, void *p) -{ - ore_put_io_state(ios); - /* Good thanks nothing to do anymore */ -} - -/* Asynchronously write the stats attribute */ -int exofs_sbi_write_stats(struct exofs_sb_info *sbi) -{ - struct osd_attr attrs[] = { - [0] = g_attr_sb_stats, - }; - struct ore_io_state *ios; - int ret; - - ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); - if (unlikely(ret)) { - EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); - return ret; - } - - sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid); - sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); - attrs[0].val_ptr = &sbi->s_ess; - - - ios->done = stats_done; - ios->private = sbi; - ios->out_attr = attrs; - ios->out_attr_len = ARRAY_SIZE(attrs); - - ret = ore_write(ios); - if (unlikely(ret)) { - EXOFS_ERR("%s: ore_write failed.\n", __func__); - ore_put_io_state(ios); - } - - return ret; -} - 
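/*
 * [Editor's note -- illustration, not part of the patch] The stats
 * write above is deliberately fire-and-forget: exofs_sbi_write_stats()
 * submits the attribute update and returns without waiting, and the
 * ->done callback owns, and releases, the io_state.  A minimal sketch
 * of that ownership pattern using only the ore calls already visible
 * in this file; the demo_* names are hypothetical and error reporting
 * is trimmed.
 */
static void demo_done(struct ore_io_state *ios, void *private)
{
	ore_put_io_state(ios);		/* completion path drops the last ref */
}

static int demo_async_attr_write(struct exofs_sb_info *sbi)
{
	struct ore_io_state *ios;
	int ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);

	if (ret)
		return ret;
	ios->done = demo_done;		/* called when the OSD I/O completes */
	ret = ore_write(ios);
	if (ret)
		ore_put_io_state(ios);	/* submit failed: free it ourselves */
	return ret;
}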
-/****************************************************************************** - * SUPERBLOCK FUNCTIONS - *****************************************************************************/ -static const struct super_operations exofs_sops; -static const struct export_operations exofs_export_ops; - -/* - * Write the superblock to the OSD - */ -static int exofs_sync_fs(struct super_block *sb, int wait) -{ - struct exofs_sb_info *sbi; - struct exofs_fscb *fscb; - struct ore_comp one_comp; - struct ore_components oc; - struct ore_io_state *ios; - int ret = -ENOMEM; - - fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); - if (unlikely(!fscb)) - return -ENOMEM; - - sbi = sb->s_fs_info; - - /* NOTE: We no longer dirty the super_block anywhere in exofs. The - * reason we write the fscb here on unmount is so we can stay backwards - * compatible with fscb->s_version == 1. (What we are not compatible - * with is if a new version FS crashed and then we try to mount an old - * version). Otherwise the exofs_fscb is read-only from mkfs time. All - * the writeable info is set in exofs_sbi_write_stats() above. - */ - - exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID); - - ret = ore_get_io_state(&sbi->layout, &oc, &ios); - if (unlikely(ret)) - goto out; - - ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); - memset(fscb, 0, ios->length); - fscb->s_nextid = cpu_to_le64(sbi->s_nextid); - fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles); - fscb->s_magic = cpu_to_le16(sb->s_magic); - fscb->s_newfs = 0; - fscb->s_version = EXOFS_FSCB_VER; - - ios->offset = 0; - ios->kern_buff = fscb; - - ret = ore_write(ios); - if (unlikely(ret)) - EXOFS_ERR("%s: ore_write failed.\n", __func__); - -out: - EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); - ore_put_io_state(ios); - kfree(fscb); - return ret; -} - -static void _exofs_print_device(const char *msg, const char *dev_path, - struct osd_dev *od, u64 pid) -{ - const struct osd_dev_info *odi = osduld_device_info(od); - - printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n", - msg, dev_path ?: "", odi->osdname, _LLU(pid)); -} - -static void exofs_free_sbi(struct exofs_sb_info *sbi) -{ - unsigned numdevs = sbi->oc.numdevs; - - while (numdevs) { - unsigned i = --numdevs; - struct osd_dev *od = ore_comp_dev(&sbi->oc, i); - - if (od) { - ore_comp_set_dev(&sbi->oc, i, NULL); - osduld_put_device(od); - } - } - kfree(sbi->oc.ods); - kfree(sbi); -} - -/* - * This function is called when the vfs is freeing the superblock. We just - * need to free our own part. - */ -static void exofs_put_super(struct super_block *sb) -{ - int num_pend; - struct exofs_sb_info *sbi = sb->s_fs_info; - - /* make sure there are no pending commands */ - for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; - num_pend = atomic_read(&sbi->s_curr_pending)) { - wait_queue_head_t wq; - - printk(KERN_NOTICE "%s: !!Pending operations in flight. " - "This is a BUG. 
please report to osd-dev@open-osd.org\n", - __func__); - init_waitqueue_head(&wq); - wait_event_timeout(wq, - (atomic_read(&sbi->s_curr_pending) == 0), - msecs_to_jiffies(100)); - } - - _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), - sbi->one_comp.obj.partition); - - exofs_sysfs_sb_del(sbi); - exofs_free_sbi(sbi); - sb->s_fs_info = NULL; -} - -static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, - struct exofs_device_table *dt) -{ - int ret; - - sbi->layout.stripe_unit = - le64_to_cpu(dt->dt_data_map.cb_stripe_unit); - sbi->layout.group_width = - le32_to_cpu(dt->dt_data_map.cb_group_width); - sbi->layout.group_depth = - le32_to_cpu(dt->dt_data_map.cb_group_depth); - sbi->layout.mirrors_p1 = - le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1; - sbi->layout.raid_algorithm = - le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); - - ret = ore_verify_layout(numdevs, &sbi->layout); - - EXOFS_DBGMSG("exofs: layout: " - "num_comps=%u stripe_unit=0x%x group_width=%u " - "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n", - numdevs, - sbi->layout.stripe_unit, - sbi->layout.group_width, - _LLU(sbi->layout.group_depth), - sbi->layout.mirrors_p1, - sbi->layout.raid_algorithm); - return ret; -} - -static unsigned __ra_pages(struct ore_layout *layout) -{ - const unsigned _MIN_RA = 32; /* min 128K read-ahead */ - unsigned ra_pages = layout->group_width * layout->stripe_unit / - PAGE_SIZE; - unsigned max_io_pages = exofs_max_io_pages(layout, ~0); - - ra_pages *= 2; /* two stripes */ - if (ra_pages < _MIN_RA) - ra_pages = roundup(_MIN_RA, ra_pages / 2); - - if (ra_pages > max_io_pages) - ra_pages = max_io_pages; - - return ra_pages; -} - -/* @odi is valid only as long as @fscb_dev is valid */ -static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, - struct osd_dev_info *odi) -{ - odi->systemid_len = le32_to_cpu(dt_dev->systemid_len); - if (likely(odi->systemid_len)) - memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN); - - odi->osdname_len = le32_to_cpu(dt_dev->osdname_len); - odi->osdname = dt_dev->osdname; - - /* FIXME support long names. Will need a _put function */ - if (dt_dev->long_name_offset) - return -EINVAL; - - /* Make sure osdname is printable! - * mkexofs should give us space for a null-terminator else the - * device-table is invalid. 
- */ - if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname))) - odi->osdname_len = sizeof(dt_dev->osdname) - 1; - dt_dev->osdname[odi->osdname_len] = 0; - - /* If it's all zeros something is bad we read past end-of-obj */ - return !(odi->systemid_len || odi->osdname_len); -} - -static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, - struct exofs_dev **peds) -{ - /* Twice bigger table: See exofs_init_comps() and comment at - * exofs_read_lookup_dev_table() - */ - const size_t numores = numdevs * 2 - 1; - struct exofs_dev *eds; - unsigned i; - - sbi->oc.ods = kzalloc(numores * sizeof(struct ore_dev *) + - numdevs * sizeof(struct exofs_dev), GFP_KERNEL); - if (unlikely(!sbi->oc.ods)) { - EXOFS_ERR("ERROR: failed allocating Device array[%d]\n", - numdevs); - return -ENOMEM; - } - - /* Start of allocated struct exofs_dev entries */ - *peds = eds = (void *)sbi->oc.ods[numores]; - /* Initialize pointers into struct exofs_dev */ - for (i = 0; i < numdevs; ++i) - sbi->oc.ods[i] = &eds[i].ored; - return 0; -} - -static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, - struct osd_dev *fscb_od, - unsigned table_count) -{ - struct ore_comp comp; - struct exofs_device_table *dt; - struct exofs_dev *eds; - unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + - sizeof(*dt); - unsigned numdevs, i; - int ret; - - dt = kmalloc(table_bytes, GFP_KERNEL); - if (unlikely(!dt)) { - EXOFS_ERR("ERROR: allocating %x bytes for device table\n", - table_bytes); - return -ENOMEM; - } - - sbi->oc.numdevs = 0; - - comp.obj.partition = sbi->one_comp.obj.partition; - comp.obj.id = EXOFS_DEVTABLE_ID; - exofs_make_credential(comp.cred, &comp.obj); - - ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt, - table_bytes); - if (unlikely(ret)) { - EXOFS_ERR("ERROR: reading device table\n"); - goto out; - } - - numdevs = le64_to_cpu(dt->dt_num_devices); - if (unlikely(!numdevs)) { - ret = -EINVAL; - goto out; - } - WARN_ON(table_count != numdevs); - - ret = _read_and_match_data_map(sbi, numdevs, dt); - if (unlikely(ret)) - goto out; - - ret = __alloc_dev_table(sbi, numdevs, &eds); - if (unlikely(ret)) - goto out; - /* exofs round-robins the device table view according to inode - * number. We hold a: twice bigger table hence inodes can point - * to any device and have a sequential view of the table - * starting at this device. See exofs_init_comps() - */ - memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], - (numdevs - 1) * sizeof(sbi->oc.ods[0])); - - /* create sysfs subdir under which we put the device table - * And cluster layout. A Superblock is identified by the string: - * "dev[0].osdname"_"pid" - */ - exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]); - - for (i = 0; i < numdevs; i++) { - struct exofs_fscb fscb; - struct osd_dev_info odi; - struct osd_dev *od; - - if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) { - EXOFS_ERR("ERROR: Read all-zeros device entry\n"); - ret = -EINVAL; - goto out; - } - - printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", - i, odi.osdname); - - /* the exofs id is currently the table index */ - eds[i].did = i; - - /* On all devices the device table is identical. The user can - * specify any one of the participating devices on the command - * line. We always keep them in device-table order. 
- */ - if (fscb_od && osduld_device_same(fscb_od, &odi)) { - eds[i].ored.od = fscb_od; - ++sbi->oc.numdevs; - fscb_od = NULL; - exofs_sysfs_odev_add(&eds[i], sbi); - continue; - } - - od = osduld_info_lookup(&odi); - if (IS_ERR(od)) { - ret = PTR_ERR(od); - EXOFS_ERR("ERROR: device requested is not found " - "osd_name-%s =>%d\n", odi.osdname, ret); - goto out; - } - - eds[i].ored.od = od; - ++sbi->oc.numdevs; - - /* Read the fscb of the other devices to make sure the FS - * partition is there. - */ - ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, - sizeof(fscb)); - if (unlikely(ret)) { - EXOFS_ERR("ERROR: Malformed participating device " - "error reading fscb osd_name-%s\n", - odi.osdname); - goto out; - } - exofs_sysfs_odev_add(&eds[i], sbi); - - /* TODO: verify other information is correct and FS-uuid - * matches. Benny what did you say about device table - * generation and old devices? - */ - } - -out: - kfree(dt); - if (unlikely(fscb_od && !ret)) { - EXOFS_ERR("ERROR: Bad device-table container device not present\n"); - osduld_put_device(fscb_od); - return -EINVAL; - } - return ret; -} - -/* - * Read the superblock from the OSD and fill in the fields - */ -static int exofs_fill_super(struct super_block *sb, - struct exofs_mountopt *opts, - struct exofs_sb_info *sbi, - int silent) -{ - struct inode *root; - struct osd_dev *od; /* Master device */ - struct exofs_fscb fscb; /*on-disk superblock info */ - struct ore_comp comp; - unsigned table_count; - int ret; - - /* use mount options to fill superblock */ - if (opts->is_osdname) { - struct osd_dev_info odi = {.systemid_len = 0}; - - odi.osdname_len = strlen(opts->dev_name); - odi.osdname = (u8 *)opts->dev_name; - od = osduld_info_lookup(&odi); - kfree(opts->dev_name); - opts->dev_name = NULL; - } else { - od = osduld_path_lookup(opts->dev_name); - } - if (IS_ERR(od)) { - ret = -EINVAL; - goto free_sbi; - } - - /* Default layout in case we do not have a device-table */ - sbi->layout.stripe_unit = PAGE_SIZE; - sbi->layout.mirrors_p1 = 1; - sbi->layout.group_width = 1; - sbi->layout.group_depth = -1; - sbi->layout.group_count = 1; - sbi->s_timeout = opts->timeout; - - sbi->one_comp.obj.partition = opts->pid; - sbi->one_comp.obj.id = 0; - exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); - sbi->oc.single_comp = EC_SINGLE_COMP; - sbi->oc.comps = &sbi->one_comp; - - /* fill in some other data by hand */ - memset(sb->s_id, 0, sizeof(sb->s_id)); - strcpy(sb->s_id, "exofs"); - sb->s_blocksize = EXOFS_BLKSIZE; - sb->s_blocksize_bits = EXOFS_BLKSHIFT; - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_max_links = EXOFS_LINK_MAX; - atomic_set(&sbi->s_curr_pending, 0); - sb->s_bdev = NULL; - sb->s_dev = 0; - - comp.obj.partition = sbi->one_comp.obj.partition; - comp.obj.id = EXOFS_SUPER_ID; - exofs_make_credential(comp.cred, &comp.obj); - - ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb)); - if (unlikely(ret)) - goto free_sbi; - - sb->s_magic = le16_to_cpu(fscb.s_magic); - /* NOTE: we read below to be backward compatible with old versions */ - sbi->s_nextid = le64_to_cpu(fscb.s_nextid); - sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); - - /* make sure what we read from the object store is correct */ - if (sb->s_magic != EXOFS_SUPER_MAGIC) { - if (!silent) - EXOFS_ERR("ERROR: Bad magic value\n"); - ret = -EINVAL; - goto free_sbi; - } - if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) { - EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", - EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); - ret = -EINVAL; 
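/*
 * [Editor's note -- illustration, not part of the patch] Everything
 * exofs_fill_super() reads from the OSD is validated before use:
 * magic first, then the on-disk FSCB version, which may be older but
 * never newer than the EXOFS_FSCB_VER this driver writes.  The same
 * policy restated as a stand-alone sketch (demo_* is hypothetical):
 */
static int demo_fscb_ok(const struct exofs_fscb *fscb)
{
	if (le16_to_cpu(fscb->s_magic) != EXOFS_SUPER_MAGIC)
		return -EINVAL;		/* not an exofs image */
	if (le32_to_cpu(fscb->s_version) > EXOFS_FSCB_VER)
		return -EINVAL;		/* written by a newer driver */
	return 0;
}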
- goto free_sbi; - } - - /* start generation numbers from a random point */ - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); - spin_lock_init(&sbi->s_next_gen_lock); - - table_count = le64_to_cpu(fscb.s_dev_table_count); - if (table_count) { - ret = exofs_read_lookup_dev_table(sbi, od, table_count); - if (unlikely(ret)) - goto free_sbi; - } else { - struct exofs_dev *eds; - - ret = __alloc_dev_table(sbi, 1, &eds); - if (unlikely(ret)) - goto free_sbi; - - ore_comp_set_dev(&sbi->oc, 0, od); - sbi->oc.numdevs = 1; - } - - __sbi_read_stats(sbi); - - /* set up operation vectors */ - ret = super_setup_bdi(sb); - if (ret) { - EXOFS_DBGMSG("Failed to super_setup_bdi\n"); - goto free_sbi; - } - sb->s_bdi->ra_pages = __ra_pages(&sbi->layout); - sb->s_fs_info = sbi; - sb->s_op = &exofs_sops; - sb->s_export_op = &exofs_export_ops; - root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); - if (IS_ERR(root)) { - EXOFS_ERR("ERROR: exofs_iget failed\n"); - ret = PTR_ERR(root); - goto free_sbi; - } - sb->s_root = d_make_root(root); - if (!sb->s_root) { - EXOFS_ERR("ERROR: get root inode failed\n"); - ret = -ENOMEM; - goto free_sbi; - } - - if (!S_ISDIR(root->i_mode)) { - dput(sb->s_root); - sb->s_root = NULL; - EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n", - root->i_mode); - ret = -EINVAL; - goto free_sbi; - } - - exofs_sysfs_dbg_print(); - _exofs_print_device("Mounting", opts->dev_name, - ore_comp_dev(&sbi->oc, 0), - sbi->one_comp.obj.partition); - return 0; - -free_sbi: - EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", - opts->dev_name, sbi->one_comp.obj.partition, ret); - exofs_free_sbi(sbi); - return ret; -} - -/* - * Set up the superblock (calls exofs_fill_super eventually) - */ -static struct dentry *exofs_mount(struct file_system_type *type, - int flags, const char *dev_name, - void *data) -{ - struct super_block *s; - struct exofs_mountopt opts; - struct exofs_sb_info *sbi; - int ret; - - ret = parse_options(data, &opts); - if (ret) { - kfree(opts.dev_name); - return ERR_PTR(ret); - } - - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) { - kfree(opts.dev_name); - return ERR_PTR(-ENOMEM); - } - - s = sget(type, NULL, set_anon_super, flags, NULL); - - if (IS_ERR(s)) { - kfree(opts.dev_name); - kfree(sbi); - return ERR_CAST(s); - } - - if (!opts.dev_name) - opts.dev_name = dev_name; - - - ret = exofs_fill_super(s, &opts, sbi, flags & SB_SILENT ? 1 : 0); - if (ret) { - deactivate_locked_super(s); - return ERR_PTR(ret); - } - s->s_flags |= SB_ACTIVE; - return dget(s->s_root); -} - -/* - * Return information about the file system state in the buffer. This is used - * by the 'df' command, for example. 
- */ -static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_sb; - struct exofs_sb_info *sbi = sb->s_fs_info; - struct ore_io_state *ios; - struct osd_attr attrs[] = { - ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, - OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), - ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION, - OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)), - }; - uint64_t capacity = ULLONG_MAX; - uint64_t used = ULLONG_MAX; - int ret; - - ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); - if (ret) { - EXOFS_DBGMSG("ore_get_io_state failed.\n"); - return ret; - } - - ios->in_attr = attrs; - ios->in_attr_len = ARRAY_SIZE(attrs); - - ret = ore_read(ios); - if (unlikely(ret)) - goto out; - - ret = extract_attr_from_ios(ios, &attrs[0]); - if (likely(!ret)) { - capacity = get_unaligned_be64(attrs[0].val_ptr); - if (unlikely(!capacity)) - capacity = ULLONG_MAX; - } else - EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n"); - - ret = extract_attr_from_ios(ios, &attrs[1]); - if (likely(!ret)) - used = get_unaligned_be64(attrs[1].val_ptr); - else - EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n"); - - /* fill in the stats buffer */ - buf->f_type = EXOFS_SUPER_MAGIC; - buf->f_bsize = EXOFS_BLKSIZE; - buf->f_blocks = capacity >> 9; - buf->f_bfree = (capacity - used) >> 9; - buf->f_bavail = buf->f_bfree; - buf->f_files = sbi->s_numfiles; - buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles; - buf->f_namelen = EXOFS_NAME_LEN; - -out: - ore_put_io_state(ios); - return ret; -} - -static const struct super_operations exofs_sops = { - .alloc_inode = exofs_alloc_inode, - .destroy_inode = exofs_destroy_inode, - .write_inode = exofs_write_inode, - .evict_inode = exofs_evict_inode, - .put_super = exofs_put_super, - .sync_fs = exofs_sync_fs, - .statfs = exofs_statfs, -}; - -/****************************************************************************** - * EXPORT OPERATIONS - *****************************************************************************/ - -static struct dentry *exofs_get_parent(struct dentry *child) -{ - unsigned long ino = exofs_parent_ino(child); - - if (!ino) - return ERR_PTR(-ESTALE); - - return d_obtain_alias(exofs_iget(child->d_sb, ino)); -} - -static struct inode *exofs_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) -{ - struct inode *inode; - - inode = exofs_iget(sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - if (generation && inode->i_generation != generation) { - /* we didn't find the right inode.. 
*/ - iput(inode); - return ERR_PTR(-ESTALE); - } - return inode; -} - -static struct dentry *exofs_fh_to_dentry(struct super_block *sb, - struct fid *fid, int fh_len, int fh_type) -{ - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - exofs_nfs_get_inode); -} - -static struct dentry *exofs_fh_to_parent(struct super_block *sb, - struct fid *fid, int fh_len, int fh_type) -{ - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - exofs_nfs_get_inode); -} - -static const struct export_operations exofs_export_ops = { - .fh_to_dentry = exofs_fh_to_dentry, - .fh_to_parent = exofs_fh_to_parent, - .get_parent = exofs_get_parent, -}; - -/****************************************************************************** - * INSMOD/RMMOD - *****************************************************************************/ - -/* - * struct that describes this file system - */ -static struct file_system_type exofs_type = { - .owner = THIS_MODULE, - .name = "exofs", - .mount = exofs_mount, - .kill_sb = generic_shutdown_super, -}; -MODULE_ALIAS_FS("exofs"); - -static int __init init_exofs(void) -{ - int err; - - err = init_inodecache(); - if (err) - goto out; - - err = register_filesystem(&exofs_type); - if (err) - goto out_d; - - /* We don't fail if sysfs creation failed */ - exofs_sysfs_init(); - - return 0; -out_d: - destroy_inodecache(); -out: - return err; -} - -static void __exit exit_exofs(void) -{ - exofs_sysfs_uninit(); - unregister_filesystem(&exofs_type); - destroy_inodecache(); -} - -MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>"); -MODULE_DESCRIPTION("exofs"); -MODULE_LICENSE("GPL"); - -module_init(init_exofs) -module_exit(exit_exofs) diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c deleted file mode 100644 index 1f7d5e46cdda..000000000000 --- a/fs/exofs/sys.c +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (C) 2012 - * Sachin Bhamare <sbhamare@panasas.com> - * Boaz Harrosh <ooo@electrozaur.com> - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License 2 as published by - * the Free Software Foundation. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the: - * Free Software Foundation <licensing@fsf.org> - */ - -#include <linux/kobject.h> -#include <linux/device.h> - -#include "exofs.h" - -struct odev_attr { - struct attribute attr; - ssize_t (*show)(struct exofs_dev *, char *); - ssize_t (*store)(struct exofs_dev *, const char *, size_t); -}; - -static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj); - struct odev_attr *a = container_of(attr, struct odev_attr, attr); - - return a->show ? a->show(edp, buf) : 0; -} - -static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj); - struct odev_attr *a = container_of(attr, struct odev_attr, attr); - - return a->store ? 
a->store(edp, buf, len) : len; -} - -static const struct sysfs_ops odev_attr_ops = { - .show = odev_attr_show, - .store = odev_attr_store, -}; - - -static struct kset *exofs_kset; - -static ssize_t osdname_show(struct exofs_dev *edp, char *buf) -{ - struct osd_dev *odev = edp->ored.od; - const struct osd_dev_info *odi = osduld_device_info(odev); - - return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname); -} - -static ssize_t systemid_show(struct exofs_dev *edp, char *buf) -{ - struct osd_dev *odev = edp->ored.od; - const struct osd_dev_info *odi = osduld_device_info(odev); - - memcpy(buf, odi->systemid, odi->systemid_len); - return odi->systemid_len; -} - -static ssize_t uri_show(struct exofs_dev *edp, char *buf) -{ - return snprintf(buf, edp->urilen, "%s", edp->uri); -} - -static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len) -{ - uint8_t *new_uri; - - edp->urilen = strlen(buf) + 1; - new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL); - if (new_uri == NULL) - return -ENOMEM; - edp->uri = new_uri; - strncpy(edp->uri, buf, edp->urilen); - return edp->urilen; -} - -#define OSD_ATTR(name, mode, show, store) \ - static struct odev_attr odev_attr_##name = \ - __ATTR(name, mode, show, store) - -OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL); -OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL); -OSD_ATTR(uri, S_IRWXU, uri_show, uri_store); - -static struct attribute *odev_attrs[] = { - &odev_attr_osdname.attr, - &odev_attr_systemid.attr, - &odev_attr_uri.attr, - NULL, -}; - -static struct kobj_type odev_ktype = { - .default_attrs = odev_attrs, - .sysfs_ops = &odev_attr_ops, -}; - -static struct kobj_type uuid_ktype = { -}; - -void exofs_sysfs_dbg_print(void) -{ -#ifdef CONFIG_EXOFS_DEBUG - struct kobject *k_name, *k_tmp; - - list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { - printk(KERN_INFO "%s: name %s ref %d\n", - __func__, kobject_name(k_name), - (int)kref_read(&k_name->kref)); - } -#endif -} -/* - * This function removes all kobjects under exofs_kset - * At the end of it, exofs_kset kobject will have a refcount - * of 1 which gets decremented only on exofs module unload - */ -void exofs_sysfs_sb_del(struct exofs_sb_info *sbi) -{ - struct kobject *k_name, *k_tmp; - struct kobject *s_kobj = &sbi->s_kobj; - - list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { - /* Remove all that are children of this SBI */ - if (k_name->parent == s_kobj) - kobject_put(k_name); - } - kobject_put(s_kobj); -} - -/* - * This function creates sysfs entries to hold the current exofs cluster - * instance (uniquely identified by osdname,pid tuple). - * This function gets called once per exofs mount instance. 
- */ -int exofs_sysfs_sb_add(struct exofs_sb_info *sbi, - struct exofs_dt_device_info *dt_dev) -{ - struct kobject *s_kobj; - int retval = 0; - uint64_t pid = sbi->one_comp.obj.partition; - - /* allocate new uuid dirent */ - s_kobj = &sbi->s_kobj; - s_kobj->kset = exofs_kset; - retval = kobject_init_and_add(s_kobj, &uuid_ktype, - &exofs_kset->kobj, "%s_%llx", dt_dev->osdname, pid); - if (retval) { - EXOFS_ERR("ERROR: Failed to create sysfs entry for " - "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval); - return -ENOMEM; - } - return 0; -} - -int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi) -{ - struct kobject *d_kobj; - int retval = 0; - - /* create osd device group which contains following attributes - * osdname, systemid & uri - */ - d_kobj = &edev->ed_kobj; - d_kobj->kset = exofs_kset; - retval = kobject_init_and_add(d_kobj, &odev_ktype, - &sbi->s_kobj, "dev%u", edev->did); - if (retval) { - EXOFS_ERR("ERROR: Failed to create sysfs entry for " - "device dev%u\n", edev->did); - return retval; - } - return 0; -} - -int exofs_sysfs_init(void) -{ - exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj); - if (!exofs_kset) { - EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n"); - return -ENOMEM; - } - return 0; -} - -void exofs_sysfs_uninit(void) -{ - kset_unregister(exofs_kset); -} diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index a453cc87082b..06f77ca7f36e 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -96,23 +96,8 @@ config EXT4_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. -config EXT4_ENCRYPTION - bool "Ext4 Encryption" - depends on EXT4_FS - select FS_ENCRYPTION - help - Enable encryption of ext4 files and directories. This - feature is similar to ecryptfs, but it is more memory - efficient since it avoids caching the encrypted and - decrypted pages in the page cache. - -config EXT4_FS_ENCRYPTION - bool - default y - depends on EXT4_ENCRYPTION - config EXT4_DEBUG - bool "EXT4 debugging support" + bool "Ext4 debugging support" depends on EXT4_FS help Enables run-time debugging support for the ext4 filesystem. 
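[Editor's note] With EXT4_ENCRYPTION folded into the shared CONFIG_FS_ENCRYPTION option above, the ext4 hunks that follow swap the filesystem-private predicate ext4_encrypted_inode() for the generic IS_ENCRYPTED() test. For orientation, a side-by-side sketch: the old helper is the one removed in the ext4.h hunk below, and IS_ENCRYPTED() is assumed here to be the VFS macro from include/linux/fs.h of this era, which tests the in-core S_ENCRYPTED bit instead of re-reading ext4's on-disk inode flag.

/* ext4-private test, removed in the ext4.h hunk below: */
static inline bool ext4_encrypted_inode(struct inode *inode)
{
	return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
}

/* generic VFS test used in its place: */
#define IS_ENCRYPTED(inode)	((inode)->i_flags & S_ENCRYPTED)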
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index f93f9881ec18..0ccd51f72048 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -111,7 +111,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) int dir_has_error = 0; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); - if (ext4_encrypted_inode(inode)) { + if (IS_ENCRYPTED(inode)) { err = fscrypt_get_encryption_info(inode); if (err && err != -ENOKEY) return err; @@ -138,7 +138,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return err; } - if (ext4_encrypted_inode(inode)) { + if (IS_ENCRYPTED(inode)) { err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr); if (err < 0) return err; @@ -245,7 +245,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { - if (!ext4_encrypted_inode(inode)) { + if (!IS_ENCRYPTED(inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), @@ -283,9 +283,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) done: err = 0; errout: -#ifdef CONFIG_EXT4_FS_ENCRYPTION fscrypt_fname_free_buffer(&fstr); -#endif brelse(bh); return err; } @@ -613,7 +611,7 @@ finished: static int ext4_dir_open(struct inode * inode, struct file * filp) { - if (ext4_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) return fscrypt_get_encryption_info(inode) ? -EACCES : 0; return 0; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 185a05d3257e..0833b5fc0668 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -40,7 +40,6 @@ #include <linux/compat.h> #endif -#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION) #include <linux/fscrypt.h> #include <linux/compiler.h> @@ -426,6 +425,9 @@ struct flex_groups { /* Flags that are appropriate for non-directories/regular files. */ #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) +/* The only flags that should be swapped */ +#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) + /* Mask out flags that are inappropriate for the given type of inode. 
*/ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { @@ -1326,7 +1328,7 @@ struct ext4_super_block { #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ #define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ EXT4_MF_TEST_DUMMY_ENCRYPTION)) #else @@ -1662,6 +1664,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +extern void ext4_update_dynamic_rev(struct super_block *sb); + #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ @@ -1670,6 +1674,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_compat |= \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } \ @@ -1687,6 +1692,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } \ @@ -1704,6 +1710,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_incompat |= \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } \ @@ -2017,7 +2024,6 @@ static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); desc.shash.tfm = sbi->s_chksum_driver; - desc.shash.flags = 0; *(u32 *)desc.ctx = crc; BUG_ON(crypto_shash_update(&desc.shash, address, length)); @@ -2051,7 +2057,7 @@ struct ext4_filename { const struct qstr *usr_fname; struct fscrypt_str disk_name; struct dx_hash_info hinfo; -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION struct fscrypt_str crypto_buf; #endif }; @@ -2279,12 +2285,7 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); -static inline bool ext4_encrypted_inode(struct inode *inode) -{ - return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); -} - -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION static inline int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname) @@ -2672,7 +2673,6 @@ do { \ #endif -extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); extern int ext4_update_rocompat_feature(handle_t *handle, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 15b6dd733780..75a5309f2231 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -384,7 +384,7 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle, { struct ext4_inode_info *ei = EXT4_I(inode); - if (ext4_handle_valid(handle)) { + if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; if (datasync) ei->i_datasync_tid = handle->h_transaction->t_tid; @@ -411,7 +411,7 @@ static inline int ext4_inode_journal_mode(struct inode *inode) 
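/*
 * [Editor's note -- illustration, not part of the patch] On the
 * ext4_update_inode_fsync_trans() hunk just above: an aborted handle
 * belongs to a transaction that will never commit, so publishing its
 * tid would leave a later fsync waiting on a commit that cannot
 * happen.  The guarded update, restated as a sketch (demo_* is
 * hypothetical):
 */
static void demo_record_sync_tid(handle_t *handle, struct inode *inode,
				 int datasync)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) {
		ei->i_sync_tid = handle->h_transaction->t_tid;
		if (datasync)
			ei->i_datasync_tid = handle->h_transaction->t_tid;
	}
}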
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && !test_opt(inode->i_sb, DELALLOC))) { /* We do not support data journalling for encrypted data */ - if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode)) + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 240b6dea5441..0f89f5190cd7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2956,14 +2956,17 @@ again: if (err < 0) goto out; - } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) { + } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && + partial.state == initial) { /* - * If there's an extent to the right its first cluster - * contains the immediate right boundary of the - * truncated/punched region. Set partial_cluster to - * its negative value so it won't be freed if shared - * with the current extent. The end < ee_block case - * is handled in ext4_ext_rm_leaf(). + * If we're punching, there's an extent to the right. + * If the partial cluster hasn't been set, set it to + * that extent's first cluster and its state to nofree + * so it won't be freed should it contain blocks to be + * removed. If it's already set (tofree/nofree), we're + * retrying and keep the original partial cluster info + * so a cluster marked tofree as a result of earlier + * extent removal is not lost. */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, @@ -3631,7 +3634,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); - if (ext4_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) max_zeroout = 0; /* @@ -4048,18 +4051,8 @@ out: } else allocated = ret; map->m_flags |= EXT4_MAP_NEW; - /* - * if we allocated more blocks than requested - * we need to make sure we unmap the extra block - * allocated. The actual needed block will get - * unmapped later when we find the buffer_head marked - * new. - */ - if (allocated > map->m_len) { - clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len, - allocated - map->m_len); + if (allocated > map->m_len) allocated = map->m_len; - } map->m_len = allocated; map_out: @@ -4818,7 +4811,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) * leave it disabled for encrypted inodes for now. This is a * bug we should fix.... 
*/ - if (ext4_encrypted_inode(inode) && + if (IS_ENCRYPTED(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | FALLOC_FL_ZERO_RANGE))) return -EOPNOTSUPP; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 69d65d49837b..98ec11f69cd4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -125,7 +125,7 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) struct super_block *sb = inode->i_sb; int blockmask = sb->s_blocksize - 1; - if (pos >= i_size_read(inode)) + if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize)) return 0; if ((pos | iov_iter_alignment(from)) & blockmask) diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index e22dcfab308b..46b24da33a28 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -231,6 +231,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) break; case DX_HASH_HALF_MD4_UNSIGNED: str2hashbuf = str2hashbuf_unsigned; + /* fall through */ case DX_HASH_HALF_MD4: p = name; while (len > 0) { @@ -244,6 +245,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) break; case DX_HASH_TEA_UNSIGNED: str2hashbuf = str2hashbuf_unsigned; + /* fall through */ case DX_HASH_TEA: p = name; while (len > 0) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 7ff14a1adba3..f3e17a8c84b4 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -771,7 +771,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (unlikely(ext4_forced_shutdown(sbi))) return ERR_PTR(-EIO); - if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && + if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) && !(i_flags & EXT4_EA_INODE_FL)) { err = fscrypt_get_encryption_info(dir); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index bf7fa1507e81..2024d3fa5504 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1183,18 +1183,21 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } + /* fall through */ case EXT4_IND_BLOCK: nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } + /* fall through */ case EXT4_DIND_BLOCK: nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } + /* fall through */ case EXT4_TIND_BLOCK: ; } @@ -1219,6 +1222,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t offsets[4], offsets2[4]; Indirect chain[4], chain2[4]; Indirect *partial, *partial2; + Indirect *p = NULL, *p2 = NULL; ext4_lblk_t max_block; __le32 nr = 0, nr2 = 0; int n = 0, n2 = 0; @@ -1260,7 +1264,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, } - partial = ext4_find_shared(inode, n, offsets, chain, &nr); + partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ @@ -1285,13 +1289,11 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); partial--; } end_range: - partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); if (nr2) { if (partial2 == chain2) { /* @@ -1321,16 +1323,14 @@ end_range: (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); - 
BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); partial2--; } goto do_indirects; } /* Punch happened within the same level (n == n2) */ - partial = ext4_find_shared(inode, n, offsets, chain, &nr); - partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); + partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); /* Free top, but only if partial2 isn't its subtree. */ if (nr) { @@ -1387,11 +1387,7 @@ end_range: partial->p + 1, partial2->p, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); - return 0; + goto cleanup; } /* @@ -1406,8 +1402,6 @@ end_range: partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); partial--; } if (partial2 > chain2 && depth2 <= depth) { @@ -1415,11 +1409,21 @@ end_range: (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); partial2--; } } + +cleanup: + while (p && p > chain) { + BUFFER_TRACE(p->bh, "call brelse"); + brelse(p->bh); + p--; + } + while (p2 && p2 > chain2) { + BUFFER_TRACE(p2->bh, "call brelse"); + brelse(p2->bh); + p2--; + } return 0; do_indirects: @@ -1427,30 +1431,33 @@ do_indirects: switch (offsets[0]) { default: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } + /* fall through */ case EXT4_IND_BLOCK: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } + /* fall through */ case EXT4_DIND_BLOCK: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } + /* fall through */ case EXT4_TIND_BLOCK: ; } - return 0; + goto cleanup; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 34d7e0703cc6..b32a57bc5d5d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -391,7 +391,7 @@ void ext4_da_update_reserve_space(struct inode *inode, * inode's preallocations. 
*/ if ((ei->i_reserved_data_blocks == 0) && - (atomic_read(&inode->i_writecount) == 0)) + !inode_is_open_for_write(inode)) ext4_discard_preallocations(inode); } @@ -415,7 +415,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, { int ret; - if (ext4_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); @@ -678,8 +678,6 @@ found: if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { - clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, - map->m_len); ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { @@ -1150,7 +1148,7 @@ int do_journal_get_write_access(handle_t *handle, return ret; } -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) { @@ -1194,7 +1192,6 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { - clean_bdev_bh_alias(bh); if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); @@ -1217,8 +1214,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, (block_start < from || block_end > to)) { ll_rw_block(REQ_OP_READ, 0, 1, &bh); *wait_bh++ = bh; - decrypt = ext4_encrypted_inode(inode) && - S_ISREG(inode->i_mode); + decrypt = IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } } /* @@ -1303,7 +1299,7 @@ retry_journal: /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(page, pos, len, ext4_get_block_unwritten); @@ -2490,10 +2486,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) } BUG_ON(map->m_len == 0); - if (map->m_flags & EXT4_MAP_NEW) { - clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, - map->m_len); - } return 0; } @@ -2836,12 +2828,12 @@ retry: goto unplug; } ret = mpage_prepare_extent_to_map(&mpd); + /* Unlock pages we didn't use */ + mpage_release_unused_pages(&mpd, false); /* Submit prepared bio */ ext4_io_submit(&mpd.io_submit); ext4_put_io_end_defer(mpd.io_submit.io_end); mpd.io_submit.io_end = NULL; - /* Unlock pages we didn't use */ - mpage_release_unused_pages(&mpd, false); if (ret < 0) goto unplug; @@ -2909,10 +2901,11 @@ retry: handle = NULL; mpd.do_map = 0; } - /* Submit prepared bio */ - ext4_io_submit(&mpd.io_submit); /* Unlock pages we didn't use */ mpage_release_unused_pages(&mpd, give_up_on_write); + /* Submit prepared bio */ + ext4_io_submit(&mpd.io_submit); + /* * Drop our io_end reference we got from init. 
We have * to be careful and use deferred io_end finishing if @@ -3105,7 +3098,7 @@ retry_journal: /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION ret = ext4_block_write_begin(page, pos, len, ext4_da_get_block_prep); #else @@ -3880,8 +3873,8 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) loff_t offset = iocb->ki_pos; ssize_t ret; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) +#ifdef CONFIG_FS_ENCRYPTION + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return 0; #endif @@ -4065,8 +4058,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* Uhhuh. Read error. Complain and punt. */ if (!buffer_uptodate(bh)) goto unlock; - if (S_ISREG(inode->i_mode) && - ext4_encrypted_inode(inode)) { + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_SIZE); @@ -4142,7 +4134,7 @@ static int ext4_block_truncate_page(handle_t *handle, struct inode *inode = mapping->host; /* If we are processing an encrypted inode during orphan list handling */ - if (ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode)) + if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; blocksize = inode->i_sb->s_blocksize; @@ -4722,7 +4714,7 @@ static bool ext4_should_use_dax(struct inode *inode) return false; if (ext4_has_inline_data(inode)) return false; - if (ext4_encrypted_inode(inode)) + if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; return true; } @@ -5072,7 +5064,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } - if (ext4_encrypted_inode(inode)) { + if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; ext4_set_aops(inode); } else if (ext4_inode_is_fast_symlink(inode)) { @@ -5351,7 +5343,6 @@ static int ext4_do_update_inode(handle_t *handle, err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (err) goto out_brelse; - ext4_update_dynamic_rev(sb); ext4_set_feature_large_file(sb); ext4_handle_sync(handle); err = ext4_handle_dirty_super(handle, sb); @@ -6002,7 +5993,7 @@ int ext4_expand_extra_isize(struct inode *inode, ext4_write_lock_xattr(inode, &no_expand); - BUFFER_TRACE(iloc.bh, "get_write_access"); + BUFFER_TRACE(iloc->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, iloc->bh); if (error) { brelse(iloc->bh); @@ -6089,36 +6080,6 @@ out: return; } -#if 0 -/* - * Bind an inode's backing buffer_head into this transaction, to prevent - * it from being flushed to disk early. Unlike - * ext4_reserve_inode_write, this leaves behind no bh reference and - * returns no iloc structure, so the caller needs to repeat the iloc - * lookup to mark the inode dirty later. 
- */ -static int ext4_pin_inode(handle_t *handle, struct inode *inode) -{ - struct ext4_iloc iloc; - - int err = 0; - if (handle) { - err = ext4_get_inode_loc(inode, &iloc); - if (!err) { - BUFFER_TRACE(iloc.bh, "get_write_access"); - err = jbd2_journal_get_write_access(handle, iloc.bh); - if (!err) - err = ext4_handle_dirty_metadata(handle, - NULL, - iloc.bh); - brelse(iloc.bh); - } - } - ext4_std_error(inode->i_sb, err); - return err; -} -#endif - int ext4_change_inode_journal_flag(struct inode *inode, int val) { journal_t *journal; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index d37dafa1d133..bab3da4f1e0d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -63,18 +63,20 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) loff_t isize; struct ext4_inode_info *ei1; struct ext4_inode_info *ei2; + unsigned long tmp; ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); swap(inode1->i_version, inode2->i_version); - swap(inode1->i_blocks, inode2->i_blocks); - swap(inode1->i_bytes, inode2->i_bytes); swap(inode1->i_atime, inode2->i_atime); swap(inode1->i_mtime, inode2->i_mtime); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); - swap(ei1->i_flags, ei2->i_flags); + tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; + ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) | + (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP); + ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP); swap(ei1->i_disksize, ei2->i_disksize); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); @@ -115,28 +117,42 @@ static long swap_inode_boot_loader(struct super_block *sb, int err; struct inode *inode_bl; struct ext4_inode_info *ei_bl; - - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || - IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || - ext4_has_inline_data(inode)) - return -EINVAL; - - if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || - !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) - return -EPERM; + qsize_t size, size_bl, diff; + blkcnt_t blocks; + unsigned short bytes; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); - filemap_flush(inode->i_mapping); - filemap_flush(inode_bl->i_mapping); - /* Protect orig inodes against a truncate and make sure, * that only 1 swap_inode_boot_loader is running. 
*/ lock_two_nondirectories(inode, inode_bl); + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || + IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || + (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) || + ext4_has_inline_data(inode)) { + err = -EINVAL; + goto journal_err_out; + } + + if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || + !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto journal_err_out; + } + + down_write(&EXT4_I(inode)->i_mmap_sem); + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto err_out; + + err = filemap_write_and_wait(inode_bl->i_mapping); + if (err) + goto err_out; + /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); @@ -147,7 +163,7 @@ static long swap_inode_boot_loader(struct super_block *sb, handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; - goto journal_err_out; + goto err_out; } /* Protect extent tree against block allocations via delalloc */ @@ -170,6 +186,13 @@ static long swap_inode_boot_loader(struct super_block *sb, memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); } + err = dquot_initialize(inode); + if (err) + goto err_out1; + + size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes; + size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes; + diff = size - size_bl; swap_inode_data(inode, inode_bl); inode->i_ctime = inode_bl->i_ctime = current_time(inode); @@ -183,34 +206,58 @@ static long swap_inode_boot_loader(struct super_block *sb, err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { + /* No need to update quota information. */ ext4_warning(inode->i_sb, "couldn't mark inode #%lu dirty (err %d)", inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); - } else { - err = ext4_mark_inode_dirty(handle, inode_bl); - if (err < 0) { - ext4_warning(inode_bl->i_sb, - "couldn't mark inode #%lu dirty (err %d)", - inode_bl->i_ino, err); - /* Revert all changes: */ - swap_inode_data(inode, inode_bl); - ext4_mark_inode_dirty(handle, inode); - ext4_mark_inode_dirty(handle, inode_bl); - } + goto err_out1; } + + blocks = inode_bl->i_blocks; + bytes = inode_bl->i_bytes; + inode_bl->i_blocks = inode->i_blocks; + inode_bl->i_bytes = inode->i_bytes; + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + /* No need to update quota information. */ + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + goto revert; + } + + /* Bootloader inode should not be counted into quota information. */ + if (diff > 0) + dquot_free_space(inode, diff); + else + err = dquot_alloc_space(inode, -1 * diff); + + if (err < 0) { +revert: + /* Revert all changes: */ + inode_bl->i_blocks = blocks; + inode_bl->i_bytes = bytes; + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, inode_bl); + } + +err_out1: ext4_journal_stop(handle); ext4_double_up_write_data_sem(inode, inode_bl); +err_out: + up_write(&EXT4_I(inode)->i_mmap_sem); journal_err_out: unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); return err; } -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION static int uuid_is_zero(__u8 u[16]) { int i; @@ -953,6 +1000,13 @@ resizefs_out: if (!blk_queue_discard(q)) return -EOPNOTSUPP; + /* + * We haven't replayed the journal, so we cannot use our + * block-bitmap-guided storage zapping commands. 
+ */ + if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) + return -EROFS; + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; @@ -978,7 +1032,7 @@ resizefs_out: return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); case EXT4_IOC_GET_ENCRYPTION_PWSALT: { -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION int err, err2; struct ext4_sb_info *sbi = EXT4_SB(sb); handle_t *handle; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e2248083cdca..6fb76d408093 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4176,9 +4176,8 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; - if ((size == isize) && - !ext4_fs_is_busy(sbi) && - (atomic_read(&ac->ac_inode->i_writecount) == 0)) { + if ((size == isize) && !ext4_fs_is_busy(sbi) && + !inode_is_open_for_write(ac->ac_inode)) { ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; } @@ -4258,7 +4257,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, - atomic_read(&ar->inode->i_writecount) ? "" : "non-"); + inode_is_open_for_write(ar->inode) ? "" : "non-"); return 0; } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2f5be02fc6f6..1083a9f3f16a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -592,8 +592,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, return -EOPNOTSUPP; } - if (ext4_encrypted_inode(orig_inode) || - ext4_encrypted_inode(donor_inode)) { + if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) { ext4_msg(orig_inode->i_sb, KERN_ERR, "Online defrag not supported for encrypted files"); return -EOPNOTSUPP; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2b928eb07fa2..980166a8122a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -612,7 +612,7 @@ static struct stats dx_show_leaf(struct inode *dir, { if (show_names) { -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION int len; char *name; struct fscrypt_str fname_crypto_str = @@ -621,7 +621,7 @@ static struct stats dx_show_leaf(struct inode *dir, name = de->name; len = de->name_len; - if (ext4_encrypted_inode(dir)) + if (IS_ENCRYPTED(dir)) res = fscrypt_get_encryption_info(dir); if (res) { printk(KERN_WARNING "Error setting up" @@ -984,9 +984,9 @@ static int htree_dirblock_to_tree(struct file *dir_file, top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION /* Check if the directory is encrypted */ - if (ext4_encrypted_inode(dir)) { + if (IS_ENCRYPTED(dir)) { err = fscrypt_get_encryption_info(dir); if (err < 0) { brelse(bh); @@ -1015,7 +1015,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, continue; if (de->inode == 0) continue; - if (!ext4_encrypted_inode(dir)) { + if (!IS_ENCRYPTED(dir)) { tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, @@ -1047,7 +1047,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, } errout: brelse(bh); -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION fscrypt_fname_free_buffer(&fname_crypto_str); #endif return count; @@ -1267,7 +1267,7 @@ static inline bool ext4_match(const struct ext4_filename *fname, f.usr_fname = fname->usr_fname; f.disk_name = fname->disk_name; 
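
The mballoc hunks above swap the open-coded atomic_read(&inode->i_writecount) for the VFS helper inode_is_open_for_write(). For reference, the helper amounts to the following (paraphrased from include/linux/fs.h of this era); the predicate is unchanged, it just names the intent and keeps i_writecount encapsulated in one place:

static inline bool inode_is_open_for_write(const struct inode *inode)
{
	return atomic_read(&inode->i_writecount) > 0;
}
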
-#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION f.crypto_buf = fname->crypto_buf; #endif return fscrypt_match_name(&f, de->name, de->name_len); @@ -1498,7 +1498,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, ext4_lblk_t block; int retval; -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION *res_dir = NULL; #endif frame = dx_probe(fname, dir, NULL, frames); @@ -1578,7 +1578,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi ino); return ERR_PTR(-EFSCORRUPTED); } - if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && + if (!IS_ERR(inode) && IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { ext4_warning(inode->i_sb, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index cff4c4aa7a9c..3e9298e6a705 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -67,7 +67,7 @@ static void ext4_finish_bio(struct bio *bio) bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION struct page *data_page = NULL; #endif struct buffer_head *bh, *head; @@ -79,7 +79,7 @@ static void ext4_finish_bio(struct bio *bio) if (!page) continue; -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION if (!page->mapping) { /* The bounce data pages are unmapped. */ data_page = page; @@ -112,7 +112,7 @@ static void ext4_finish_bio(struct bio *bio) bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); local_irq_restore(flags); if (!under_io) { -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION if (data_page) fscrypt_restore_control_page(data_page); #endif @@ -468,18 +468,15 @@ int ext4_bio_write_page(struct ext4_io_submit *io, ext4_io_submit(io); continue; } - if (buffer_new(bh)) { + if (buffer_new(bh)) clear_buffer_new(bh); - clean_bdev_bh_alias(bh); - } set_buffer_async_write(bh); nr_to_submit++; } while ((bh = bh->b_this_page) != head); bh = head = page_buffers(page); - if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) && - nr_to_submit) { + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) { gfp_t gfp_flags = GFP_NOFS; retry_encrypt: diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index e53639784892..3adadf461825 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -49,7 +49,7 @@ static inline bool ext4_bio_encrypted(struct bio *bio) { -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION return unlikely(bio->bi_private != NULL); #else return false; @@ -243,8 +243,7 @@ int ext4_mpage_readpages(struct address_space *mapping, if (bio == NULL) { struct fscrypt_ctx *ctx = NULL; - if (ext4_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) { ctx = fscrypt_get_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) goto set_error_page; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 48421de803b7..e7ae26e36c9c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -932,11 +932,18 @@ static int add_new_gdb_meta_bg(struct super_block *sb, memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); n_group_desc[gdb_num] = gdb_bh; + + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb_bh); + if (err) { + kvfree(n_group_desc); + brelse(gdb_bh); + return err; + } + EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; kvfree(o_group_desc); - BUFFER_TRACE(gdb_bh, "get_write_access"); - err = 
ext4_journal_get_write_access(handle, gdb_bh); return err; } @@ -1960,7 +1967,8 @@ retry: le16_to_cpu(es->s_reserved_gdt_blocks); n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); n_blocks_count = (ext4_fsblk_t)n_group * - EXT4_BLOCKS_PER_GROUP(sb); + EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(es->s_first_data_block); n_group--; /* set to last group number */ } @@ -2072,6 +2080,10 @@ out: free_flex_gd(flex_gd); if (resize_inode != NULL) iput(resize_inode); - ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count); + if (err) + ext4_warning(sb, "error (%d) occurred during " + "file system resize", err); + ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", + ext4_blocks_count(es)); return err; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fb12d3c17c1b..6ed4eb81e674 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -430,6 +430,12 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) spin_unlock(&sbi->s_md_lock); } +static bool system_going_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * @@ -460,7 +466,12 @@ static void ext4_handle_error(struct super_block *sb) if (journal) jbd2_journal_abort(journal, -EIO); } - if (test_opt(sb, ERRORS_RO)) { + /* + * We force ERRORS_RO behavior when system is rebooting. Otherwise we + * could panic during 'reboot -f' as the underlying device got already + * disabled. + */ + if (test_opt(sb, ERRORS_RO) || system_going_down()) { ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible @@ -468,8 +479,7 @@ static void ext4_handle_error(struct super_block *sb) */ smp_wmb(); sb->s_flags |= SB_RDONLY; - } - if (test_opt(sb, ERRORS_PANIC)) { + } else if (test_opt(sb, ERRORS_PANIC)) { if (EXT4_SB(sb)->s_journal && !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) return; @@ -1232,7 +1242,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, return try_to_free_buffers(page); } -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION static int ext4_get_context(struct inode *inode, void *ctx, size_t len) { return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, @@ -1922,7 +1932,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); } else if (token == Opt_test_dummy_encryption) { -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION; ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); @@ -2249,7 +2259,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); - ext4_update_dynamic_rev(sb); if (sbi->s_journal) ext4_set_feature_journal_needs_recovery(sb); @@ -4167,7 +4176,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &ext4_cryptops; #endif #ifdef CONFIG_QUOTA diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 9212a026a1f1..616c075da062 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -30,6 +30,7 @@ 
typedef enum { attr_feature, attr_pointer_ui, attr_pointer_atomic, + attr_journal_task, } attr_id_t; typedef enum { @@ -125,6 +126,14 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi, return count; } +static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf) +{ + if (!sbi->s_journal) + return snprintf(buf, PAGE_SIZE, "<none>\n"); + return snprintf(buf, PAGE_SIZE, "%d\n", + task_pid_vnr(sbi->s_journal->j_task)); +} + #define EXT4_ATTR(_name,_mode,_id) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -188,6 +197,7 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); EXT4_ATTR(first_error_time, 0444, first_error_time); EXT4_ATTR(last_error_time, 0444, last_error_time); +EXT4_ATTR(journal_task, 0444, journal_task); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -217,6 +227,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(errors_count), ATTR_LIST(first_error_time), ATTR_LIST(last_error_time), + ATTR_LIST(journal_task), NULL, }; @@ -224,7 +235,7 @@ static struct attribute *ext4_attrs[] = { EXT4_ATTR_FEATURE(lazy_itable_init); EXT4_ATTR_FEATURE(batched_discard); EXT4_ATTR_FEATURE(meta_bg_resize); -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION EXT4_ATTR_FEATURE(encryption); #endif EXT4_ATTR_FEATURE(metadata_csum_seed); @@ -233,7 +244,7 @@ static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), ATTR_LIST(meta_bg_resize), -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), #endif ATTR_LIST(metadata_csum_seed), @@ -304,6 +315,8 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return print_tstamp(buf, sbi->s_es, s_first_error_time); case attr_last_error_time: return print_tstamp(buf, sbi->s_es, s_last_error_time); + case attr_journal_task: + return journal_task_show(sbi, buf); } return 0; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 86ed9c686249..dc82e7757f67 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -829,6 +829,7 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { ret = PTR_ERR(bh); + bh = NULL; goto out; } @@ -2903,6 +2904,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, if (error == -EIO) EXT4_ERROR_INODE(inode, "block %llu read error", EXT4_I(inode)->i_file_acl); + bh = NULL; goto cleanup; } error = ext4_xattr_check_block(inode, bh); @@ -3059,6 +3061,7 @@ ext4_xattr_block_cache_find(struct inode *inode, if (IS_ERR(bh)) { if (PTR_ERR(bh) == -ENOMEM) return NULL; + bh = NULL; EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 9a20ef42fadd..e57cc754d543 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -3,6 +3,7 @@ config F2FS_FS depends on BLOCK select CRYPTO select CRYPTO_CRC32 + select F2FS_FS_XATTR if FS_ENCRYPTION help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. The design has been focused on @@ -70,17 +71,6 @@ config F2FS_CHECK_FS If you want to improve the performance, say N. 
-config F2FS_FS_ENCRYPTION - bool "F2FS Encryption" - depends on F2FS_FS - depends on F2FS_FS_XATTR - select FS_ENCRYPTION - help - Enable encryption of f2fs files and directories. This - feature is similar to ecryptfs, but it is more memory - efficient since it avoids caching the encrypted and - decrypted pages in the page cache. - config F2FS_IO_TRACE bool "F2FS IO tracer" depends on F2FS_FS diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f955cd3e0677..a98e1b02279e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -306,8 +306,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping, goto skip_write; /* collect a number of dirty meta pages and write together */ - if (wbc->for_kupdate || - get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_META) < + nr_pages_to_skip(sbi, META)) goto skip_write; /* if locked failed, cp will flush dirty pages instead */ @@ -405,7 +406,7 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); return 1; } @@ -956,7 +957,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); } @@ -1259,10 +1260,17 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) else __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG); + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK)) + __set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + else + __clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); - else - __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + /* + * TODO: we count on fsck.f2fs to clear this flag until we figure out + * missing cases which clear it incorrectly. + */ if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index da060b77f64d..9727944139f2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -301,9 +301,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, for (; start < F2FS_IO_SIZE(sbi); start++) { struct page *page = mempool_alloc(sbi->write_io_dummy, - GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL); + GFP_NOIO | __GFP_NOFAIL); f2fs_bug_on(sbi, !page); + zero_user_segment(page, 0, PAGE_SIZE); SetPagePrivate(page); set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); lock_page(page); @@ -1468,7 +1469,7 @@ next: } if (size) { - if (f2fs_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; ret = fiemap_fill_next_extent(fieinfo, logical, @@ -1553,6 +1554,9 @@ static int f2fs_mpage_readpages(struct address_space *mapping, if (last_block > last_block_in_file) last_block = last_block_in_file; + /* just zeroing out page which is beyond EOF */ + if (block_in_file >= last_block) + goto zero_out; /* * Map blocks using the previous result first. */ @@ -1565,16 +1569,11 @@ static int f2fs_mpage_readpages(struct address_space *mapping, * Then do more f2fs_map_blocks() calls until we are * done with this page. 
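
The surrounding readpages hunks add a shortcut for pages lying wholly beyond EOF: no mapping lookup, no I/O, just a zero-filled, up-to-date page. Distilled, with the control flow simplified (next_page is assumed to be the function's existing loop label):

	if (block_in_file >= last_block) {
		/* page starts past EOF: serve zeroes directly */
		zero_user_segment(page, 0, PAGE_SIZE);
		if (!PageUptodate(page))
			SetPageUptodate(page);
		unlock_page(page);
		goto next_page;
	}
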
*/ - map.m_flags = 0; - - if (block_in_file < last_block) { - map.m_lblk = block_in_file; - map.m_len = last_block - block_in_file; + map.m_lblk = block_in_file; + map.m_len = last_block - block_in_file; - if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_DEFAULT)) - goto set_error_page; - } + if (f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT)) + goto set_error_page; got_it: if ((map.m_flags & F2FS_MAP_MAPPED)) { block_nr = map.m_pblk + block_in_file - map.m_lblk; @@ -1589,6 +1588,7 @@ got_it: DATA_GENERIC)) goto set_error_page; } else { +zero_out: zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) SetPageUptodate(page); @@ -1739,7 +1739,7 @@ static inline bool check_inplace_update_policy(struct inode *inode, if (policy & (0x1 << F2FS_IPU_ASYNC) && fio && fio->op == REQ_OP_WRITE && !(fio->op_flags & REQ_SYNC) && - !f2fs_encrypted_inode(inode)) + !IS_ENCRYPTED(inode)) return true; /* this is only set during fdatasync */ @@ -1863,8 +1863,13 @@ got_it: if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); err = f2fs_inplace_write_data(fio); - if (err && PageWriteback(page)) - end_page_writeback(page); + if (err) { + if (f2fs_encrypted_file(inode)) + fscrypt_pullback_bio_page(&fio->encrypted_page, + true); + if (PageWriteback(page)) + end_page_writeback(page); + } trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); return err; @@ -2315,7 +2320,8 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - f2fs_truncate_blocks(inode, i_size, true, true); + if (!IS_NOQUOTA(inode)) + f2fs_truncate_blocks(inode, i_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -2585,14 +2591,11 @@ static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode, { struct f2fs_private_dio *dio; bool write = (bio_op(bio) == REQ_OP_WRITE); - int err; dio = f2fs_kzalloc(F2FS_I_SB(inode), sizeof(struct f2fs_private_dio), GFP_NOFS); - if (!dio) { - err = -ENOMEM; + if (!dio) goto out; - } dio->inode = inode; dio->orig_end_io = bio->bi_end_io; @@ -2710,12 +2713,10 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, clear_cold_data(page); - /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) return f2fs_drop_inmem_page(inode, page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); } int f2fs_release_page(struct page *page, gfp_t wait) @@ -2729,8 +2730,7 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 0; clear_cold_data(page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); return 1; } @@ -2798,12 +2798,8 @@ int f2fs_migrate_page(struct address_space *mapping, return -EAGAIN; } - /* - * A reference is expected if PagePrivate set when move mapping, - * however F2FS breaks this for maintaining dirty page counts when - * truncating pages. So here adjusting the 'extra_count' make it work. - */ - extra_count = (atomic_written ? 1 : 0) - page_has_private(page); + /* one extra reference was held for atomic_write page */ + extra_count = atomic_written ? 
1 : 0; rc = migrate_page_move_mapping(mapping, newpage, page, mode, extra_count); if (rc != MIGRATEPAGE_SUCCESS) { @@ -2824,9 +2820,10 @@ int f2fs_migrate_page(struct address_space *mapping, get_page(newpage); } - if (PagePrivate(page)) - SetPagePrivate(newpage); - set_page_private(newpage, page_private(page)); + if (PagePrivate(page)) { + f2fs_set_page_private(newpage, page_private(page)); + f2fs_clear_page_private(page); + } if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fd7f170e2f2d..99e9a5c37b71 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -96,8 +96,10 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->free_secs = free_sections(sbi); si->prefree_count = prefree_segments(sbi); si->dirty_count = dirty_segments(sbi); - si->node_pages = NODE_MAPPING(sbi)->nrpages; - si->meta_pages = META_MAPPING(sbi)->nrpages; + if (sbi->node_inode) + si->node_pages = NODE_MAPPING(sbi)->nrpages; + if (sbi->meta_inode) + si->meta_pages = META_MAPPING(sbi)->nrpages; si->nats = NM_I(sbi)->nat_cnt; si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); @@ -175,7 +177,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) static void update_mem_info(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); - unsigned npages; int i; if (si->base_mem) @@ -258,10 +259,14 @@ get_cache: sizeof(struct extent_node); si->page_mem = 0; - npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_SHIFT; - npages = META_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + if (sbi->node_inode) { + unsigned npages = NODE_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } + if (sbi->meta_inode) { + unsigned npages = META_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } } static int stat_show(struct seq_file *s, void *v) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 50d0d36280fa..59bc46017855 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -385,7 +385,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, if (err) goto put_error; - if ((f2fs_encrypted_inode(dir) || dummy_encrypt) && + if ((IS_ENCRYPTED(dir) || dummy_encrypt) && f2fs_may_encrypt(inode)) { err = fscrypt_inherit_context(dir, inode, page, false); if (err) @@ -399,7 +399,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, if (new_name) { init_dent_inode(new_name, page); - if (f2fs_encrypted_inode(dir)) + if (IS_ENCRYPTED(dir)) file_set_enc_name(inode); } @@ -728,7 +728,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, !f2fs_truncate_hole(dir, page->index, page->index + 1)) { f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); - ClearPagePrivate(page); + f2fs_clear_page_private(page); ClearPageUptodate(page); clear_cold_data(page); inode_dec_dirty_pages(dir); @@ -800,6 +800,10 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (de->name_len == 0) { bit_pos++; ctx->pos = start_pos + bit_pos; + printk_ratelimited( + "%s, invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, le32_to_cpu(de->ino)); + set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } @@ -810,7 +814,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, /* check memory boundary before moving forward */ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - if 
(unlikely(bit_pos > d->max)) { + if (unlikely(bit_pos > d->max || + le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) { f2fs_msg(sbi->sb, KERN_WARNING, "%s: corrupted namelen=%d, run fsck to fix.", __func__, le16_to_cpu(de->name_len)); @@ -819,7 +824,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, goto out; } - if (f2fs_encrypted_inode(d->inode)) { + if (IS_ENCRYPTED(d->inode)) { int save_len = fstr->len; err = fscrypt_fname_disk_to_usr(d->inode, @@ -862,7 +867,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct fscrypt_str fstr = FSTR_INIT(NULL, 0); int err = 0; - if (f2fs_encrypted_inode(inode)) { + if (IS_ENCRYPTED(inode)) { err = fscrypt_get_encryption_info(inode); if (err && err != -ENOKEY) goto out; @@ -891,7 +896,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = f2fs_get_lock_data_page(inode, n, false); + dentry_page = f2fs_find_data_page(inode, n); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { @@ -909,11 +914,11 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); if (err) { - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); break; } - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); } out_free: fscrypt_fname_free_buffer(&fstr); @@ -924,7 +929,7 @@ out: static int f2fs_dir_open(struct inode *inode, struct file *filp) { - if (f2fs_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) return fscrypt_get_encryption_info(inode) ? -EACCES : 0; return 0; } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 1cb0fcc67d2d..caf77fe8ac07 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -506,7 +506,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, unsigned int end = fofs + len; unsigned int pos = (unsigned int)fofs; bool updated = false; - bool leftmost; + bool leftmost = false; if (!et) return; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 02d0cd199828..bacf5c2a8850 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -24,7 +24,6 @@ #include <linux/quotaops.h> #include <crypto/hash.h> -#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION) #include <linux/fscrypt.h> #ifdef CONFIG_F2FS_CHECK_FS @@ -191,6 +190,8 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ +#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ struct cp_control { int reason; @@ -254,7 +255,7 @@ struct discard_entry { /* max discard pend list number */ #define MAX_PLIST_NUM 512 #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? 
\ - (MAX_PLIST_NUM - 1) : (blk_num - 1)) + (MAX_PLIST_NUM - 1) : ((blk_num) - 1)) enum { D_PREP, /* initial */ @@ -310,6 +311,7 @@ struct discard_policy { bool sync; /* submit discard with REQ_SYNC flag */ bool ordered; /* issue discard by lba order */ unsigned int granularity; /* discard granularity */ + int timeout; /* discard timeout for put_super */ }; struct discard_cmd_control { @@ -456,7 +458,6 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 -#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ @@ -1099,6 +1100,7 @@ enum { SBI_IS_SHUTDOWN, /* shutdown by ioctl */ SBI_IS_RECOVERED, /* recovered orphan/data */ SBI_CP_DISABLED, /* CP was disabled last mount */ + SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */ SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ @@ -1110,6 +1112,7 @@ enum { DISCARD_TIME, GC_TIME, DISABLE_TIME, + UMOUNT_DISCARD_TIMEOUT, MAX_TIME, }; @@ -1137,7 +1140,7 @@ enum fsync_mode { FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ }; -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) \ (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) #else @@ -1238,8 +1241,6 @@ struct f2fs_sb_info { unsigned int nquota_files; /* # of quota sysfile */ - u32 s_next_generation; /* for NFS support */ - /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ @@ -1421,7 +1422,6 @@ static inline u32 __f2fs_crc32(struct f2fs_sb_info *sbi, u32 crc, BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); desc.shash.tfm = sbi->s_chksum_driver; - desc.shash.flags = 0; *(u32 *)desc.ctx = crc; err = crypto_shash_update(&desc.shash, address, length); @@ -1799,13 +1799,12 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || - count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA || - count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE || - count_type == F2FS_RD_META) - return; - - set_sbi_flag(sbi, SBI_IS_DIRTY); + if (count_type == F2FS_DIRTY_DENTS || + count_type == F2FS_DIRTY_NODES || + count_type == F2FS_DIRTY_META || + count_type == F2FS_DIRTY_QDATA || + count_type == F2FS_DIRTY_IMETA) + set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) @@ -2157,10 +2156,17 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || get_pages(sbi, F2FS_WB_CP_DATA) || get_pages(sbi, F2FS_DIO_READ) || - get_pages(sbi, F2FS_DIO_WRITE) || - atomic_read(&SM_I(sbi)->dcc_info->queued_discard) || - atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + get_pages(sbi, F2FS_DIO_WRITE)) + return false; + + if (SM_I(sbi) && SM_I(sbi)->dcc_info && + atomic_read(&SM_I(sbi)->dcc_info->queued_discard)) + return false; + + if (SM_I(sbi) && SM_I(sbi)->fcc_info && + atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) return false; + return f2fs_time_over(sbi, type); } @@ -2301,11 +2307,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) #define F2FS_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define 
F2FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define F2FS_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define F2FS_NOCOW_FL 0x00800000 /* Do not cow file */ #define F2FS_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define F2FS_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define F2FS_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define F2FS_FL_USER_VISIBLE 0x30CBDFFF /* User visible flags */ #define F2FS_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ /* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */ @@ -2762,9 +2769,9 @@ static inline int get_inline_xattr_addrs(struct inode *inode) #define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) #define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ - ((offsetof(typeof(*f2fs_inode), field) + \ + ((offsetof(typeof(*(f2fs_inode)), field) + \ sizeof((f2fs_inode)->field)) \ - <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) { @@ -2793,8 +2800,8 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, #define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) -#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \ - (!is_read_io(fio->op) || fio->is_meta)) +#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META && \ + (!is_read_io((fio)->op) || (fio)->is_meta)) bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); @@ -2826,13 +2833,33 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, return true; } +static inline void f2fs_set_page_private(struct page *page, + unsigned long data) +{ + if (PagePrivate(page)) + return; + + get_page(page); + SetPagePrivate(page); + set_page_private(page, data); +} + +static inline void f2fs_clear_page_private(struct page *page) +{ + if (!PagePrivate(page)) + return; + + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); +} + /* * file.c */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void f2fs_truncate_data_blocks(struct dnode_of_data *dn); -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, - bool buf_write); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); int f2fs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); @@ -3006,7 +3033,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); -bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi); void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi); @@ -3463,19 +3490,14 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); /* * crypto support */ -static inline bool f2fs_encrypted_inode(struct inode *inode) -{ - return file_is_encrypt(inode); -} - static inline bool f2fs_encrypted_file(struct inode *inode) { - return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode); + return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } static inline void f2fs_set_encrypted_inode(struct inode *inode) { -#ifdef 
CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION file_set_encrypt(inode); f2fs_set_inode_flags(inode); #endif @@ -3554,7 +3576,7 @@ static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) static inline bool f2fs_may_encrypt(struct inode *inode) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION umode_t mode = inode->i_mode; return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); @@ -3616,8 +3638,6 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, #define f2fs_build_fault_attr(sbi, rate, type) do { } while (0) #endif -#endif - static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) { #ifdef CONFIG_QUOTA @@ -3630,3 +3650,5 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) #endif return false; } + +#endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bba56b39dcc5..5742ab8b57dc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -582,15 +582,14 @@ truncate_out: zero_user(page, offset, PAGE_SIZE - offset); /* An encrypted inode should have a key and truncate the last page. */ - f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode)); + f2fs_bug_on(F2FS_I_SB(inode), cache_only && IS_ENCRYPTED(inode)); if (!cache_only) set_page_dirty(page); f2fs_put_page(page, 1); return 0; } -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, - bool buf_write) +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -598,7 +597,6 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, int count = 0, err = 0; struct page *ipage; bool truncate_page = false; - int flag = buf_write ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; trace_f2fs_truncate_blocks_enter(inode, from); @@ -608,7 +606,7 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, goto free_partial; if (lock) - __do_map_lock(sbi, flag, true); + f2fs_lock_op(sbi); ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { @@ -646,7 +644,7 @@ free_next: err = f2fs_truncate_inode_blocks(inode, free_from); out: if (lock) - __do_map_lock(sbi, flag, false); + f2fs_unlock_op(sbi); free_partial: /* lastly zero out the first data page */ if (!err) @@ -681,7 +679,7 @@ int f2fs_truncate(struct inode *inode) return err; } - err = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); + err = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (err) return err; @@ -711,7 +709,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, stat->attributes |= STATX_ATTR_APPEND; if (flags & F2FS_COMPR_FL) stat->attributes |= STATX_ATTR_COMPRESSED; - if (f2fs_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) stat->attributes |= STATX_ATTR_ENCRYPTED; if (flags & F2FS_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; @@ -768,7 +766,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; - bool size_changed = false; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -843,8 +840,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) down_write(&F2FS_I(inode)->i_sem); F2FS_I(inode)->last_disk_size = i_size_read(inode); up_write(&F2FS_I(inode)->i_sem); - - size_changed = true; } __setattr_copy(inode, attr); @@ -858,7 +853,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } /* file size may changed here */ - f2fs_mark_inode_dirty_sync(inode, size_changed); + f2fs_mark_inode_dirty_sync(inode, 
true); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -1262,7 +1257,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) new_size = i_size_read(inode) - len; truncate_pagecache(inode, new_size); - ret = f2fs_truncate_blocks(inode, new_size, true, false); + ret = f2fs_truncate_blocks(inode, new_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); @@ -1447,7 +1442,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); down_write(&F2FS_I(inode)->i_mmap_sem); - ret = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); + ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); up_write(&F2FS_I(inode)->i_mmap_sem); if (ret) return ret; @@ -1563,7 +1558,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EINVAL; - if (f2fs_encrypted_inode(inode) && + if (IS_ENCRYPTED(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; @@ -1647,10 +1642,12 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int flags = fi->i_flags; - if (f2fs_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) flags |= F2FS_ENCRYPT_FL; if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) flags |= F2FS_INLINE_DATA_FL; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + flags |= F2FS_NOCOW_FL; flags &= F2FS_FL_USER_VISIBLE; @@ -1750,10 +1747,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (!get_dirty_pages(inode)) - goto skip_flush; - - f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + /* + * Should wait end_io to count F2FS_WB_CP_DATA correctly by + * f2fs_is_atomic_file. 
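
The atomic-write entry point reworked above (it now always flushes and waits, so the F2FS_WB_CP_DATA count stays accurate) is driven from user space by a pair of ioctls. A sketch follows; the ioctl numbers match the f2fs headers of this era, and update_atomically() is a hypothetical wrapper:

#include <sys/ioctl.h>
#include <unistd.h>

#define F2FS_IOCTL_MAGIC		0xf5
#define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
#define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)

/* Writes between start and commit land as one unit, or not at all. */
int update_atomically(int fd, const void *buf, size_t len)
{
	if (ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE) < 0)
		return -1;
	if (pwrite(fd, buf, len, 0) != (ssize_t)len)
		return -1;
	return ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE);
}
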
+ */ + if (get_dirty_pages(inode)) + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); @@ -1761,7 +1760,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } -skip_flush: + set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -1968,11 +1967,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) break; case F2FS_GOING_DOWN_NEED_FSCK: set_sbi_flag(sbi, SBI_NEED_FSCK); + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + set_sbi_flag(sbi, SBI_IS_DIRTY); /* do checkpoint only */ ret = f2fs_sync_fs(sb, 1); - if (ret) - goto out; - break; + goto out; default: ret = -EINVAL; goto out; @@ -1988,6 +1987,9 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) out: if (in != F2FS_GOING_DOWN_FULLSYNC) mnt_drop_write_file(filp); + + trace_f2fs_shutdown(sbi, in, ret); + return ret; } @@ -2414,7 +2416,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode)) return -EINVAL; - if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) + if (IS_ENCRYPTED(src) || IS_ENCRYPTED(dst)) return -EOPNOTSUPP; if (src == dst) { @@ -2871,8 +2873,8 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) __u32 pin; int ret = 0; - if (!inode_owner_or_capable(inode)) - return -EACCES; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; if (get_user(pin, (__u32 __user *)arg)) return -EFAULT; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index d636cbcf68f2..bb6a152310ef 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -298,7 +298,7 @@ process_inline: clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - if (f2fs_truncate_blocks(inode, 0, false, false)) + if (f2fs_truncate_blocks(inode, 0, false)) return false; goto process_inline; } @@ -470,7 +470,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) return 0; punch_dentry_pages: truncate_inode_pages(&dir->i_data, 0); - f2fs_truncate_blocks(dir, 0, false, false); + f2fs_truncate_blocks(dir, 0, false); f2fs_remove_dirty_inode(dir); return err; } @@ -659,6 +659,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (IS_ERR(ipage)) return PTR_ERR(ipage); + /* + * f2fs_readdir was protected by inode.i_rwsem, it is safe to access + * ipage without page's lock held. + */ + unlock_page(ipage); + inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -667,7 +673,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (!err) ctx->pos = d.max; - f2fs_put_page(ipage, 1); + f2fs_put_page(ipage, 0); return err < 0 ? 
err : 0; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bec52961630b..e7f2e8759315 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -14,6 +14,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include <trace/events/f2fs.h> @@ -43,7 +44,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & F2FS_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (f2fs_encrypted_inode(inode)) + if (file_is_encrypt(inode)) new_fl |= S_ENCRYPTED; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC| @@ -248,6 +249,20 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (f2fs_has_extra_attr(inode) && + f2fs_sb_has_flexible_inline_xattr(sbi) && + f2fs_has_inline_xattr(inode) && + (!fi->i_inline_xattr_size || + fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) has corrupted " + "i_inline_xattr_size: %d, max: %zu", + __func__, inode->i_ino, fi->i_inline_xattr_size, + MAX_INLINE_XATTR_SIZE); + return false; + } + if (F2FS_I(inode)->extent_tree) { struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; @@ -453,7 +468,7 @@ make_now: inode->i_mapping->a_ops = &f2fs_dblock_aops; inode_nohighmem(inode); } else if (S_ISLNK(inode->i_mode)) { - if (f2fs_encrypted_inode(inode)) + if (file_is_encrypt(inode)) inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 62d9829f3a6a..f5e34e467003 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/sched.h> #include <linux/ctype.h> +#include <linux/random.h> #include <linux/dcache.h> #include <linux/namei.h> #include <linux/quotaops.h> @@ -50,7 +51,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); F2FS_I(inode)->i_crtime = inode->i_mtime; - inode->i_generation = sbi->s_next_generation++; + inode->i_generation = prandom_u32(); if (S_ISDIR(inode->i_mode)) F2FS_I(inode)->i_current_depth = 1; @@ -75,7 +76,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_NEW_INODE); /* If the directory encrypted, then we should encrypt the inode. 
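
Note the generation change above: f2fs_new_inode() stops consuming the per-superblock s_next_generation counter (deleted from f2fs_sb_info earlier in this series) and draws a random value instead. Since i_generation ends up inside NFS file handles, the likely motivation, not stated in the diff and so an inference here, is to make stale handles unguessable while dropping a shared counter:

#include <linux/random.h>

static void assign_generation(struct inode *inode)
{
	/* hypothetical wrapper; the patch open-codes this assignment */
	inode->i_generation = prandom_u32();
}
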
*/ - if ((f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && + if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); @@ -476,7 +477,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (err) goto out_iput; } - if (f2fs_encrypted_inode(dir) && + if (IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { f2fs_msg(inode->i_sb, KERN_WARNING, @@ -803,7 +804,7 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if (f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) { + if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) { int err = fscrypt_get_encryption_info(dir); if (err) return err; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4f450e573312..3f99ab288695 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1920,7 +1920,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, f2fs_balance_fs_bg(sbi); /* collect a number of dirty node pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_NODES) < + nr_pages_to_skip(sbi, NODE)) goto skip_write; if (wbc->sync_mode == WB_SYNC_ALL) @@ -1959,7 +1961,7 @@ static int f2fs_set_node_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); return 1; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9b79056d705d..aa7fe79b62b2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -191,8 +191,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) f2fs_trace_pid(page); - set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); - SetPagePrivate(page); + f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -215,7 +214,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) } static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover) + struct list_head *head, bool drop, bool recover, + bool trylock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inmem_pages *cur, *tmp; @@ -227,7 +227,16 @@ static int __revoke_inmem_pages(struct inode *inode, if (drop) trace_f2fs_commit_inmem_page(page, INMEM_DROP); - lock_page(page); + if (trylock) { + /* + * to avoid deadlock in between page lock and + * inmem_lock. 
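
The trylock mode whose code continues just below breaks an ABBA inversion: the register path enters with the page already locked and then takes inmem_lock, while this revoke path holds inmem_lock when it goes for the page lock. A sketch of the writer side's ordering, assuming f2fs_register_inmem_page()'s context in this kernel:

	/* writer: A (page lock, held by the caller) then B (inmem_lock) */
	mutex_lock(&fi->inmem_lock);		/* page is already locked */
	list_add_tail(&new->list, &fi->inmem_pages);
	mutex_unlock(&fi->inmem_lock);

	/*
	 * revoke: holds B, so it may only trylock A; contended pages
	 * are skipped and the caller loops until the list drains
	 * (see f2fs_drop_inmem_pages below).
	 */
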
+ */ + if (!trylock_page(page)) + continue; + } else { + lock_page(page); + } f2fs_wait_on_page_writeback(page, DATA, true, true); @@ -270,8 +279,7 @@ next: ClearPageUptodate(page); clear_cold_data(page); } - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); f2fs_put_page(page, 1); list_del(&cur->list); @@ -318,13 +326,19 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - mutex_lock(&fi->inmem_lock); - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - mutex_unlock(&fi->inmem_lock); + while (!list_empty(&fi->inmem_pages)) { + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, true); + + if (list_empty(&fi->inmem_pages)) { + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } + mutex_unlock(&fi->inmem_lock); + } clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; @@ -354,8 +368,7 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) kmem_cache_free(inmem_entry_slab, cur); ClearPageUptodate(page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); f2fs_put_page(page, 0); trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); @@ -429,12 +442,15 @@ retry: * recovery or rewrite & commit last transaction. For other * error number, revoking was done by filesystem itself. */ - err = __revoke_inmem_pages(inode, &revoke_list, false, true); + err = __revoke_inmem_pages(inode, &revoke_list, + false, true, false); /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, false); } else { - __revoke_inmem_pages(inode, &revoke_list, false, false); + __revoke_inmem_pages(inode, &revoke_list, + false, false, false); } return err; @@ -542,9 +558,13 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio = f2fs_bio_alloc(sbi, 0, true); + struct bio *bio; int ret; + bio = f2fs_bio_alloc(sbi, 0, false); + if (!bio) + return -ENOMEM; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; bio_set_dev(bio, bdev); ret = submit_bio_wait(bio); @@ -868,6 +888,9 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi) if (holes[DATA] > ovp || holes[NODE] > ovp) return -EAGAIN; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) && + dirty_segments(sbi) > overprovision_segments(sbi)) + return -EAGAIN; return 0; } @@ -1037,6 +1060,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->timeout = 0; if (discard_type == DPOLICY_BG) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -1059,6 +1083,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->max_requests = UINT_MAX; dpolicy->io_aware = false; + /* we need to issue all to keep CP_TRIMMED_FLAG */ + dpolicy->granularity = 1; } } @@ -1424,7 +1450,14 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, int i, issued = 0; bool io_interrupted = false; + if 
(dpolicy->timeout != 0) + f2fs_update_time(sbi, dpolicy->timeout); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (dpolicy->timeout != 0 && + f2fs_time_over(sbi, dpolicy->timeout)) + break; + if (i + 1 < dpolicy->granularity) break; @@ -1611,7 +1644,7 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) } /* This comes from f2fs_put_super */ -bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_policy dpolicy; @@ -1619,6 +1652,7 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT; __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); @@ -3164,10 +3198,10 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); - if (!err) + if (!err) { update_device_state(fio); - - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + } return err; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a77f76f528b6..5c7ed0442d6e 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -865,7 +865,7 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) } } mutex_unlock(&dcc->cmd_lock); - if (!wakeup) + if (!wakeup || !is_idle(sbi, DISCARD_TIME)) return; wake_up: dcc->discard_wake = 1; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3d3ce9eb6d13..f2aaa2cc6b3e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -269,7 +269,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, if (!qname) { f2fs_msg(sb, KERN_ERR, "Not enough memory for storing quotafile name"); - return -EINVAL; + return -ENOMEM; } if (F2FS_OPTION(sbi).s_qf_names[qtype]) { if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) @@ -586,7 +586,7 @@ static int parse_options(struct super_block *sb, char *options) case Opt_io_size_bits: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg > __ilog2_u32(BIO_MAX_PAGES)) { + if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) { f2fs_msg(sb, KERN_WARNING, "Not support %d, larger than %d", 1 << arg, BIO_MAX_PAGES); @@ -757,7 +757,7 @@ static int parse_options(struct super_block *sb, char *options) kvfree(name); break; case Opt_test_dummy_encryption: -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION if (!f2fs_sb_has_encrypt(sbi)) { f2fs_msg(sb, KERN_ERR, "Encrypt feature is off"); return -EINVAL; @@ -821,6 +821,8 @@ static int parse_options(struct super_block *sb, char *options) } if (test_opt(sbi, INLINE_XATTR_SIZE)) { + int min_size, max_size; + if (!f2fs_sb_has_extra_attr(sbi) || !f2fs_sb_has_flexible_inline_xattr(sbi)) { f2fs_msg(sb, KERN_ERR, @@ -834,14 +836,15 @@ static int parse_options(struct super_block *sb, char *options) "set with inline_xattr option"); return -EINVAL; } - if (!F2FS_OPTION(sbi).inline_xattr_size || - F2FS_OPTION(sbi).inline_xattr_size >= - DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE - - DEF_INLINE_RESERVED_SIZE - - DEF_MIN_INLINE_SIZE) { + + min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32); + max_size = MAX_INLINE_XATTR_SIZE; + + if (F2FS_OPTION(sbi).inline_xattr_size < min_size || + F2FS_OPTION(sbi).inline_xattr_size > max_size) { f2fs_msg(sb, KERN_ERR, - "inline xattr size is out of range"); + "inline xattr size is out of range: %d ~ %d", + min_size, max_size); 
return -EINVAL; } } @@ -915,6 +918,10 @@ static int f2fs_drop_inode(struct inode *inode) sb_start_intwrite(inode->i_sb); f2fs_i_size_write(inode, 0); + f2fs_submit_merged_write_cond(F2FS_I_SB(inode), + inode, NULL, 0, DATA); + truncate_inode_pages_final(inode->i_mapping); + if (F2FS_HAS_BLOCKS(inode)) f2fs_truncate(inode); @@ -1048,7 +1055,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - dropped = f2fs_wait_discard_bios(sbi); + dropped = f2fs_issue_discard_timeout(sbi); if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && !sbi->discard_blks && !dropped) { @@ -1075,7 +1082,10 @@ static void f2fs_put_super(struct super_block *sb) f2fs_bug_on(sbi, sbi->fsync_node_num); iput(sbi->node_inode); + sbi->node_inode = NULL; + iput(sbi->meta_inode); + sbi->meta_inode = NULL; /* * iput() can update stat information, if f2fs_write_checkpoint() @@ -1390,7 +1400,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",whint_mode=%s", "user-based"); else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) seq_printf(seq, ",whint_mode=%s", "fs-based"); -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION if (F2FS_OPTION(sbi).test_dummy_encryption) seq_puts(seq, ",test_dummy_encryption"); #endif @@ -1455,9 +1465,16 @@ static int f2fs_enable_quotas(struct super_block *sb); static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { + unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; - int err; + int err = 0; + int ret; + if (s_flags & SB_RDONLY) { + f2fs_msg(sbi->sb, KERN_ERR, + "checkpoint=disable on readonly fs"); + return -EINVAL; + } sbi->sb->s_flags |= SB_ACTIVE; f2fs_update_time(sbi, DISABLE_TIME); @@ -1465,18 +1482,24 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) while (!f2fs_time_over(sbi, DISABLE_TIME)) { mutex_lock(&sbi->gc_mutex); err = f2fs_gc(sbi, true, false, NULL_SEGNO); - if (err == -ENODATA) + if (err == -ENODATA) { + err = 0; break; + } if (err && err != -EAGAIN) - return err; + break; } - err = sync_filesystem(sbi->sb); - if (err) - return err; + ret = sync_filesystem(sbi->sb); + if (ret || err) { + err = ret ? ret: err; + goto restore_flag; + } - if (f2fs_disable_cp_again(sbi)) - return -EAGAIN; + if (f2fs_disable_cp_again(sbi)) { + err = -EAGAIN; + goto restore_flag; + } mutex_lock(&sbi->gc_mutex); cpc.reason = CP_PAUSE; @@ -1485,7 +1508,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) sbi->unusable_block_count = 0; mutex_unlock(&sbi->gc_mutex); - return 0; +restore_flag: + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return err; } static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) @@ -2023,6 +2048,12 @@ void f2fs_quota_off_umount(struct super_block *sb) set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); } } + /* + * In case of checkpoint=disable, we must flush quota blocks. + * This can cause NULL exception for node_inode in end_io, since + * put_super already dropped it. 
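
Two of the put_super() hunks above pair with the debug.c change earlier in this patch: clearing sbi->node_inode and sbi->meta_inode right after the final iput() lets late-running code test the pointer rather than dereference a dead inode. The pattern, distilled (the stats fragment mirrors the earlier hunk):

	iput(sbi->node_inode);
	sbi->node_inode = NULL;	/* NODE_MAPPING() is gone from here on */

	/* ... elsewhere, e.g. the stats code: */
	if (sbi->node_inode)
		si->node_pages = NODE_MAPPING(sbi)->nrpages;
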
+ */ + sync_filesystem(sb); } static void f2fs_truncate_quota_inode_pages(struct super_block *sb) @@ -2154,7 +2185,7 @@ static const struct super_operations f2fs_sops = { .remount_fs = f2fs_remount, }; -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) { return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, @@ -2703,6 +2734,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; + sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] = + DEF_UMOUNT_DISCARD_TIMEOUT; clear_sbi_flag(sbi, SBI_NEED_FSCK); for (i = 0; i < NR_COUNT_TYPE; i++) @@ -3022,10 +3055,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct f2fs_super_block *raw_super; struct inode *root; int err; - bool retry = true, need_fsck = false; + bool skip_recovery = false, need_fsck = false; char *options = NULL; int recovery, i, valid_super_block; struct curseg_info *seg_i; + int retry_cnt = 1; try_onemore: err = -EINVAL; @@ -3097,7 +3131,6 @@ try_onemore: sb->s_maxbytes = sbi->max_file_blocks << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; @@ -3116,7 +3149,7 @@ try_onemore: #endif sb->s_op = &f2fs_sops; -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &f2fs_cryptops; #endif sb->s_xattr = f2fs_xattr_handlers; @@ -3200,6 +3233,10 @@ try_onemore: if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_DISABLED_QUICK_FLAG)) { + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_QUICK_INTERVAL; + } /* Initialize device list */ err = f2fs_scan_devices(sbi); @@ -3288,7 +3325,7 @@ try_onemore: sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { err = -ENOMEM; - goto free_root_inode; + goto free_node_inode; } err = f2fs_register_sysfs(sbi); @@ -3310,7 +3347,7 @@ try_onemore: goto free_meta; if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) - goto skip_recovery; + goto reset_checkpoint; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -3327,11 +3364,13 @@ try_onemore: if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); - if (!retry) - goto skip_recovery; + if (skip_recovery) + goto reset_checkpoint; err = f2fs_recover_fsync_data(sbi, false); if (err < 0) { + if (err != -ENOMEM) + skip_recovery = true; need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); @@ -3347,14 +3386,14 @@ try_onemore: goto free_meta; } } -skip_recovery: +reset_checkpoint: /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) - goto free_meta; + goto sync_free_meta; } else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) { f2fs_enable_checkpoint(sbi); } @@ -3367,7 +3406,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) - goto free_meta; + goto sync_free_meta; } kvfree(options); @@ -3387,8 +3426,14 @@ skip_recovery: cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); + clear_sbi_flag(sbi, 
SBI_CP_DISABLED_QUICK); return 0; +sync_free_meta: + /* safe to flush all the data */ + sync_filesystem(sbi->sb); + retry_cnt = 0; + free_meta: #ifdef CONFIG_QUOTA f2fs_truncate_quota_inode_pages(sb); @@ -3402,6 +3447,8 @@ free_meta: * falls into an infinite loop in f2fs_sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); + /* evict some inodes being cached by GC */ + evict_inodes(sb); f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); @@ -3410,6 +3457,7 @@ free_node_inode: f2fs_release_ino_entry(sbi, true); truncate_inode_pages_final(NODE_MAPPING(sbi)); iput(sbi->node_inode); + sbi->node_inode = NULL; free_stats: f2fs_destroy_stats(sbi); free_nm: @@ -3422,6 +3470,7 @@ free_devices: free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); + sbi->meta_inode = NULL; free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_percpu: @@ -3443,8 +3492,8 @@ free_sbi: kvfree(sbi); /* give only one another chance */ - if (retry) { - retry = false; + if (retry_cnt > 0 && skip_recovery) { + retry_cnt--; shrink_dcache_sb(sb); goto try_onemore; } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 0575edbe3ed6..729f46a3c9ee 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -222,6 +222,8 @@ out: #ifdef CONFIG_F2FS_FAULT_INJECTION if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) return -EINVAL; + if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX) + return -EINVAL; #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); @@ -278,10 +280,16 @@ out: return count; } - *ui = t; - if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) - f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "iostat_enable")) { + sbi->iostat_enable = !!t; + if (!sbi->iostat_enable) + f2fs_reset_iostat(sbi); + return count; + } + + *ui = (unsigned int)t; + return count; } @@ -418,6 +426,8 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, interval_time[DISCARD_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, + umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); @@ -431,7 +441,7 @@ F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); #endif #ifdef CONFIG_BLK_DEV_ZONED @@ -475,6 +485,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(idle_interval), ATTR_LIST(discard_idle_interval), ATTR_LIST(gc_idle_interval), + ATTR_LIST(umount_discard_timeout), ATTR_LIST(iostat_enable), ATTR_LIST(readdir_ra), ATTR_LIST(gc_pin_file_thresh), @@ -492,7 +503,7 @@ static struct attribute *f2fs_attrs[] = { }; static struct attribute *f2fs_feat_attrs[] = { -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), #endif #ifdef CONFIG_BLK_DEV_ZONED diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index ce2a5eb210b6..d0ab533a9ce8 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -14,7 +14,7 @@ #include "trace.h" static RADIX_TREE(pids, GFP_ATOMIC); -static struct mutex pids_lock; +static spinlock_t pids_lock; static struct last_io_info last_io; static inline void 
__print_last_io(void) @@ -58,23 +58,29 @@ void f2fs_trace_pid(struct page *page) set_page_private(page, (unsigned long)pid); +retry: if (radix_tree_preload(GFP_NOFS)) return; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); p = radix_tree_lookup(&pids, pid); if (p == current) goto out; if (p) radix_tree_delete(&pids, pid); - f2fs_radix_tree_insert(&pids, pid, current); + if (radix_tree_insert(&pids, pid, current)) { + spin_unlock(&pids_lock); + radix_tree_preload_end(); + cond_resched(); + goto retry; + } trace_printk("%3x:%3x %4x %-16s\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), pid, current->comm); out: - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); radix_tree_preload_end(); } @@ -119,7 +125,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) void f2fs_build_trace_ios(void) { - mutex_init(&pids_lock); + spin_lock_init(&pids_lock); } #define PIDVEC_SIZE 128 @@ -147,7 +153,7 @@ void f2fs_destroy_trace_ios(void) pid_t next_pid = 0; unsigned int found; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { unsigned idx; @@ -155,5 +161,5 @@ void f2fs_destroy_trace_ios(void) for (idx = 0; idx < found; idx++) radix_tree_delete(&pids, pid[idx]); } - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 18d5ffbc5e8c..848a785abe25 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -224,11 +224,11 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, { struct f2fs_xattr_entry *entry; unsigned int inline_size = inline_xattr_size(inode); + void *max_addr = base_addr + inline_size; list_for_each_xattr(entry, base_addr) { - if ((void *)entry + sizeof(__u32) > base_addr + inline_size || - (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > - base_addr + inline_size) { + if ((void *)entry + sizeof(__u32) > max_addr || + (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { *last_addr = entry; return NULL; } @@ -239,6 +239,13 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, if (!memcmp(entry->e_name, name, len)) break; } + + /* inline xattr header or entry across max inline xattr size */ + if (IS_XATTR_LAST_ENTRY(entry) && + (void *)entry + sizeof(__u32) > max_addr) { + *last_addr = entry; + return NULL; + } return entry; } @@ -340,7 +347,7 @@ check: *base_addr = txattr_addr; return 0; out: - kzfree(txattr_addr); + kvfree(txattr_addr); return err; } @@ -383,7 +390,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, *base_addr = txattr_addr; return 0; fail: - kzfree(txattr_addr); + kvfree(txattr_addr); return err; } @@ -510,7 +517,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, } error = size; out: - kzfree(base_addr); + kvfree(base_addr); return error; } @@ -538,7 +545,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if (!handler || (handler->list && !handler->list(dentry))) continue; - prefix = handler->prefix ?: handler->name; + prefix = xattr_prefix(handler); prefix_len = strlen(prefix); size = prefix_len + entry->e_name_len + 1; if (buffer) { @@ -556,7 +563,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } error = buffer_size - rest; cleanup: - kzfree(base_addr); + kvfree(base_addr); return error; } @@ -687,7 +694,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: - kzfree(base_addr); + 
kvfree(base_addr); return error; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 67db134da0f5..9172ee082ca8 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -78,6 +78,12 @@ struct f2fs_xattr_entry { sizeof(struct f2fs_xattr_header) - \ sizeof(struct f2fs_xattr_entry)) +#define MAX_INLINE_XATTR_SIZE \ + (DEF_ADDRS_PER_INODE - \ + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \ + DEF_INLINE_RESERVED_SIZE - \ + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) + /* * On-disk structure of f2fs_xattr * We use inline xattrs space + 1 block for xattr. diff --git a/fs/filesystems.c b/fs/filesystems.c index b03f57b1105b..9135646e41ac 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/fs_parser.h> /* * Handling of filesystem drivers list. @@ -73,6 +74,9 @@ int register_filesystem(struct file_system_type * fs) int res = 0; struct file_system_type ** p; + if (fs->parameters && !fs_validate_description(fs->parameters)) + return -EINVAL; + BUG_ON(strchr(fs->name, '.')); if (fs->next) return -EBUSY; diff --git a/fs/fs_context.c b/fs/fs_context.c new file mode 100644 index 000000000000..87e3546b9a52 --- /dev/null +++ b/fs/fs_context.c @@ -0,0 +1,642 @@ +/* Provide a way to create a superblock configuration context within the kernel + * that allows a superblock to be set up prior to mounting. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/nsproxy.h> +#include <linux/slab.h> +#include <linux/magic.h> +#include <linux/security.h> +#include <linux/mnt_namespace.h> +#include <linux/pid_namespace.h> +#include <linux/user_namespace.h> +#include <net/net_namespace.h> +#include "mount.h" +#include "internal.h" + +enum legacy_fs_param { + LEGACY_FS_UNSET_PARAMS, + LEGACY_FS_MONOLITHIC_PARAMS, + LEGACY_FS_INDIVIDUAL_PARAMS, +}; + +struct legacy_fs_context { + char *legacy_data; /* Data page for legacy filesystems */ + size_t data_size; + enum legacy_fs_param param_type; +}; + +static int legacy_init_fs_context(struct fs_context *fc); + +static const struct constant_table common_set_sb_flag[] = { + { "dirsync", SB_DIRSYNC }, + { "lazytime", SB_LAZYTIME }, + { "mand", SB_MANDLOCK }, + { "posixacl", SB_POSIXACL }, + { "ro", SB_RDONLY }, + { "sync", SB_SYNCHRONOUS }, +}; + +static const struct constant_table common_clear_sb_flag[] = { + { "async", SB_SYNCHRONOUS }, + { "nolazytime", SB_LAZYTIME }, + { "nomand", SB_MANDLOCK }, + { "rw", SB_RDONLY }, + { "silent", SB_SILENT }, +}; + +static const char *const forbidden_sb_flag[] = { + "bind", + "dev", + "exec", + "move", + "noatime", + "nodev", + "nodiratime", + "noexec", + "norelatime", + "nostrictatime", + "nosuid", + "private", + "rec", + "relatime", + "remount", + "shared", + "slave", + "strictatime", + "suid", + "unbindable", +}; + +/* + * Check for a common mount option that manipulates s_flags. 
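+ *
+ * Returns 0 if the key named a generic flag, -EINVAL if the key is
+ * forbidden here, and -ENOPARAM if it is not a generic sb flag.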
+ */ +static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) +{ + unsigned int token; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(forbidden_sb_flag); i++) + if (strcmp(key, forbidden_sb_flag[i]) == 0) + return -EINVAL; + + token = lookup_constant(common_set_sb_flag, key, 0); + if (token) { + fc->sb_flags |= token; + fc->sb_flags_mask |= token; + return 0; + } + + token = lookup_constant(common_clear_sb_flag, key, 0); + if (token) { + fc->sb_flags &= ~token; + fc->sb_flags_mask |= token; + return 0; + } + + return -ENOPARAM; +} + +/** + * vfs_parse_fs_param - Add a single parameter to a superblock config + * @fc: The filesystem context to modify + * @param: The parameter + * + * A single mount option in string form is applied to the filesystem context + * being set up. Certain standard options (for example "ro") are translated + * into flag bits without going to the filesystem. The active security module + * is allowed to observe and poach options. Any other options are passed over + * to the filesystem to parse. + * + * This may be called multiple times for a context. + * + * Returns 0 on success and a negative error code on failure. In the event of + * failure, supplementary error information may have been set. + */ +int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) +{ + int ret; + + if (!param->key) + return invalf(fc, "Unnamed parameter\n"); + + ret = vfs_parse_sb_flag(fc, param->key); + if (ret != -ENOPARAM) + return ret; + + ret = security_fs_context_parse_param(fc, param); + if (ret != -ENOPARAM) + /* Param belongs to the LSM or is disallowed by the LSM; so + * don't pass to the FS. + */ + return ret; + + if (fc->ops->parse_param) { + ret = fc->ops->parse_param(fc, param); + if (ret != -ENOPARAM) + return ret; + } + + /* If the filesystem doesn't take any arguments, give it the + * default handling of source. + */ + if (strcmp(param->key, "source") == 0) { + if (param->type != fs_value_is_string) + return invalf(fc, "VFS: Non-string source"); + if (fc->source) + return invalf(fc, "VFS: Multiple sources"); + fc->source = param->string; + param->string = NULL; + return 0; + } + + return invalf(fc, "%s: Unknown parameter '%s'", + fc->fs_type->name, param->key); +} +EXPORT_SYMBOL(vfs_parse_fs_param); + +/** + * vfs_parse_fs_string - Convenience function to just parse a string. + */ +int vfs_parse_fs_string(struct fs_context *fc, const char *key, + const char *value, size_t v_size) +{ + int ret; + + struct fs_parameter param = { + .key = key, + .type = fs_value_is_string, + .size = v_size, + }; + + if (v_size > 0) { + param.string = kmemdup_nul(value, v_size, GFP_KERNEL); + if (!param.string) + return -ENOMEM; + } + + ret = vfs_parse_fs_param(fc, ¶m); + kfree(param.string); + return ret; +} +EXPORT_SYMBOL(vfs_parse_fs_string); + +/** + * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data + * @ctx: The superblock configuration to fill in. + * @data: The data to parse + * + * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be + * called from the ->monolithic_mount_data() fs_context operation. + * + * Returns 0 on success or the error returned by the ->parse_option() fs_context + * operation on failure. 
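+ * Empty comma-separated segments and keys that begin with "=" are
+ * skipped rather than treated as errors.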
+ */ +int generic_parse_monolithic(struct fs_context *fc, void *data) +{ + char *options = data, *key; + int ret = 0; + + if (!options) + return 0; + + ret = security_sb_eat_lsm_opts(options, &fc->security); + if (ret) + return ret; + + while ((key = strsep(&options, ",")) != NULL) { + if (*key) { + size_t v_len = 0; + char *value = strchr(key, '='); + + if (value) { + if (value == key) + continue; + *value++ = 0; + v_len = strlen(value); + } + ret = vfs_parse_fs_string(fc, key, value, v_len); + if (ret < 0) + break; + } + } + + return ret; +} +EXPORT_SYMBOL(generic_parse_monolithic); + +/** + * alloc_fs_context - Create a filesystem context. + * @fs_type: The filesystem type. + * @reference: The dentry from which this one derives (or NULL) + * @sb_flags: Filesystem/superblock flags (SB_*) + * @sb_flags_mask: Applicable members of @sb_flags + * @purpose: The purpose that this configuration shall be used for. + * + * Open a filesystem and create a mount context. The mount context is + * initialised with the supplied flags and, if a submount/automount from + * another superblock (referred to by @reference) is supplied, may have + * parameters such as namespaces copied across from that superblock. + */ +static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, + struct dentry *reference, + unsigned int sb_flags, + unsigned int sb_flags_mask, + enum fs_context_purpose purpose) +{ + int (*init_fs_context)(struct fs_context *); + struct fs_context *fc; + int ret = -ENOMEM; + + fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); + if (!fc) + return ERR_PTR(-ENOMEM); + + fc->purpose = purpose; + fc->sb_flags = sb_flags; + fc->sb_flags_mask = sb_flags_mask; + fc->fs_type = get_filesystem(fs_type); + fc->cred = get_current_cred(); + fc->net_ns = get_net(current->nsproxy->net_ns); + + switch (purpose) { + case FS_CONTEXT_FOR_MOUNT: + fc->user_ns = get_user_ns(fc->cred->user_ns); + break; + case FS_CONTEXT_FOR_SUBMOUNT: + fc->user_ns = get_user_ns(reference->d_sb->s_user_ns); + break; + case FS_CONTEXT_FOR_RECONFIGURE: + /* We don't pin any namespaces as the superblock's + * subscriptions cannot be changed at this point. 
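+		 * We pin the superblock itself instead: s_active is raised
+		 * and fc->root holds a ref on the root dentry.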
+ */ + atomic_inc(&reference->d_sb->s_active); + fc->root = dget(reference); + break; + } + + /* TODO: Make all filesystems support this unconditionally */ + init_fs_context = fc->fs_type->init_fs_context; + if (!init_fs_context) + init_fs_context = legacy_init_fs_context; + + ret = init_fs_context(fc); + if (ret < 0) + goto err_fc; + fc->need_free = true; + return fc; + +err_fc: + put_fs_context(fc); + return ERR_PTR(ret); +} + +struct fs_context *fs_context_for_mount(struct file_system_type *fs_type, + unsigned int sb_flags) +{ + return alloc_fs_context(fs_type, NULL, sb_flags, 0, + FS_CONTEXT_FOR_MOUNT); +} +EXPORT_SYMBOL(fs_context_for_mount); + +struct fs_context *fs_context_for_reconfigure(struct dentry *dentry, + unsigned int sb_flags, + unsigned int sb_flags_mask) +{ + return alloc_fs_context(dentry->d_sb->s_type, dentry, sb_flags, + sb_flags_mask, FS_CONTEXT_FOR_RECONFIGURE); +} +EXPORT_SYMBOL(fs_context_for_reconfigure); + +struct fs_context *fs_context_for_submount(struct file_system_type *type, + struct dentry *reference) +{ + return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); +} +EXPORT_SYMBOL(fs_context_for_submount); + +void fc_drop_locked(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + dput(fc->root); + fc->root = NULL; + deactivate_locked_super(sb); +} + +static void legacy_fs_context_free(struct fs_context *fc); + +/** + * vfs_dup_fc_config: Duplicate a filesystem context. + * @src_fc: The context to copy. + */ +struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) +{ + struct fs_context *fc; + int ret; + + if (!src_fc->ops->dup) + return ERR_PTR(-EOPNOTSUPP); + + fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL); + if (!fc) + return ERR_PTR(-ENOMEM); + + fc->fs_private = NULL; + fc->s_fs_info = NULL; + fc->source = NULL; + fc->security = NULL; + get_filesystem(fc->fs_type); + get_net(fc->net_ns); + get_user_ns(fc->user_ns); + get_cred(fc->cred); + + /* Can't call put until we've called ->dup */ + ret = fc->ops->dup(fc, src_fc); + if (ret < 0) + goto err_fc; + + ret = security_fs_context_dup(fc, src_fc); + if (ret < 0) + goto err_fc; + return fc; + +err_fc: + put_fs_context(fc); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(vfs_dup_fs_context); + +#ifdef CONFIG_PRINTK +/** + * logfc - Log a message to a filesystem context + * @fc: The filesystem context to log to. + * @fmt: The format of the buffer. + */ +void logfc(struct fs_context *fc, const char *fmt, ...) +{ + va_list va; + + va_start(va, fmt); + + switch (fmt[0]) { + case 'w': + vprintk_emit(0, LOGLEVEL_WARNING, NULL, 0, fmt, va); + break; + case 'e': + vprintk_emit(0, LOGLEVEL_ERR, NULL, 0, fmt, va); + break; + default: + vprintk_emit(0, LOGLEVEL_NOTICE, NULL, 0, fmt, va); + break; + } + + pr_cont("\n"); + va_end(va); +} +EXPORT_SYMBOL(logfc); +#endif + +/** + * put_fs_context - Dispose of a superblock configuration context. + * @fc: The context to dispose of. + */ +void put_fs_context(struct fs_context *fc) +{ + struct super_block *sb; + + if (fc->root) { + sb = fc->root->d_sb; + dput(fc->root); + fc->root = NULL; + deactivate_super(sb); + } + + if (fc->need_free && fc->ops && fc->ops->free) + fc->ops->free(fc); + + security_free_mnt_opts(&fc->security); + put_net(fc->net_ns); + put_user_ns(fc->user_ns); + put_cred(fc->cred); + kfree(fc->subtype); + put_filesystem(fc->fs_type); + kfree(fc->source); + kfree(fc); +} +EXPORT_SYMBOL(put_fs_context); + +/* + * Free the config for a filesystem that doesn't support fs_context. 
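+ * Monolithic data belongs to the caller, so only an individually
+ * built option string is freed here.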
+ */ +static void legacy_fs_context_free(struct fs_context *fc) +{ + struct legacy_fs_context *ctx = fc->fs_private; + + if (ctx) { + if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) + kfree(ctx->legacy_data); + kfree(ctx); + } +} + +/* + * Duplicate a legacy config. + */ +static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) +{ + struct legacy_fs_context *ctx; + struct legacy_fs_context *src_ctx = src_fc->fs_private; + + ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) { + ctx->legacy_data = kmemdup(src_ctx->legacy_data, + src_ctx->data_size, GFP_KERNEL); + if (!ctx->legacy_data) { + kfree(ctx); + return -ENOMEM; + } + } + + fc->fs_private = ctx; + return 0; +} + +/* + * Add a parameter to a legacy config. We build up a comma-separated list of + * options. + */ +static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct legacy_fs_context *ctx = fc->fs_private; + unsigned int size = ctx->data_size; + size_t len = 0; + + if (strcmp(param->key, "source") == 0) { + if (param->type != fs_value_is_string) + return invalf(fc, "VFS: Legacy: Non-string source"); + if (fc->source) + return invalf(fc, "VFS: Legacy: Multiple sources"); + fc->source = param->string; + param->string = NULL; + return 0; + } + + if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) && + strcmp(param->key, "subtype") == 0) { + if (param->type != fs_value_is_string) + return invalf(fc, "VFS: Legacy: Non-string subtype"); + if (fc->subtype) + return invalf(fc, "VFS: Legacy: Multiple subtype"); + fc->subtype = param->string; + param->string = NULL; + return 0; + } + + if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) + return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); + + switch (param->type) { + case fs_value_is_string: + len = 1 + param->size; + /* Fall through */ + case fs_value_is_flag: + len += strlen(param->key); + break; + default: + return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported", + param->key); + } + + if (len > PAGE_SIZE - 2 - size) + return invalf(fc, "VFS: Legacy: Cumulative options too large"); + if (strchr(param->key, ',') || + (param->type == fs_value_is_string && + memchr(param->string, ',', param->size))) + return invalf(fc, "VFS: Legacy: Option '%s' contained comma", + param->key); + if (!ctx->legacy_data) { + ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!ctx->legacy_data) + return -ENOMEM; + } + + ctx->legacy_data[size++] = ','; + len = strlen(param->key); + memcpy(ctx->legacy_data + size, param->key, len); + size += len; + if (param->type == fs_value_is_string) { + ctx->legacy_data[size++] = '='; + memcpy(ctx->legacy_data + size, param->string, param->size); + size += param->size; + } + ctx->legacy_data[size] = '\0'; + ctx->data_size = size; + ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS; + return 0; +} + +/* + * Add monolithic mount data. 
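+ * The blob is kept by reference, not copied, and binary mount data
+ * is passed through without LSM option parsing.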
+ */ +static int legacy_parse_monolithic(struct fs_context *fc, void *data) +{ + struct legacy_fs_context *ctx = fc->fs_private; + + if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) { + pr_warn("VFS: Can't mix monolithic and individual options\n"); + return -EINVAL; + } + + ctx->legacy_data = data; + ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS; + if (!ctx->legacy_data) + return 0; + + if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA) + return 0; + return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security); +} + +/* + * Get a mountable root with the legacy mount command. + */ +static int legacy_get_tree(struct fs_context *fc) +{ + struct legacy_fs_context *ctx = fc->fs_private; + struct super_block *sb; + struct dentry *root; + + root = fc->fs_type->mount(fc->fs_type, fc->sb_flags, + fc->source, ctx->legacy_data); + if (IS_ERR(root)) + return PTR_ERR(root); + + sb = root->d_sb; + BUG_ON(!sb); + + fc->root = root; + return 0; +} + +/* + * Handle remount. + */ +static int legacy_reconfigure(struct fs_context *fc) +{ + struct legacy_fs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; + + if (!sb->s_op->remount_fs) + return 0; + + return sb->s_op->remount_fs(sb, &fc->sb_flags, + ctx ? ctx->legacy_data : NULL); +} + +const struct fs_context_operations legacy_fs_context_ops = { + .free = legacy_fs_context_free, + .dup = legacy_fs_context_dup, + .parse_param = legacy_parse_param, + .parse_monolithic = legacy_parse_monolithic, + .get_tree = legacy_get_tree, + .reconfigure = legacy_reconfigure, +}; + +/* + * Initialise a legacy context for a filesystem that doesn't support + * fs_context. + */ +static int legacy_init_fs_context(struct fs_context *fc) +{ + fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL); + if (!fc->fs_private) + return -ENOMEM; + fc->ops = &legacy_fs_context_ops; + return 0; +} + +int parse_monolithic_mount_data(struct fs_context *fc, void *data) +{ + int (*monolithic_mount_data)(struct fs_context *, void *); + + monolithic_mount_data = fc->ops->parse_monolithic; + if (!monolithic_mount_data) + monolithic_mount_data = generic_parse_monolithic; + + return monolithic_mount_data(fc, data); +} diff --git a/fs/fs_parser.c b/fs/fs_parser.c new file mode 100644 index 000000000000..570d71043acf --- /dev/null +++ b/fs/fs_parser.c @@ -0,0 +1,447 @@ +/* Filesystem parameter parser. + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/export.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/slab.h> +#include <linux/security.h> +#include <linux/namei.h> +#include "internal.h" + +static const struct constant_table bool_names[] = { + { "0", false }, + { "1", true }, + { "false", false }, + { "no", false }, + { "true", true }, + { "yes", true }, +}; + +/** + * lookup_constant - Look up a constant by name in an ordered table + * @tbl: The table of constants to search. + * @tbl_size: The size of the table. + * @name: The name to look up. + * @not_found: The value to return if the name is not found. 
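+ *
+ * The lookup is a simple linear scan; tables are expected to be small.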
+ */ +int __lookup_constant(const struct constant_table *tbl, size_t tbl_size, + const char *name, int not_found) +{ + unsigned int i; + + for (i = 0; i < tbl_size; i++) + if (strcmp(name, tbl[i].name) == 0) + return tbl[i].value; + + return not_found; +} +EXPORT_SYMBOL(__lookup_constant); + +static const struct fs_parameter_spec *fs_lookup_key( + const struct fs_parameter_description *desc, + const char *name) +{ + const struct fs_parameter_spec *p; + + if (!desc->specs) + return NULL; + + for (p = desc->specs; p->name; p++) + if (strcmp(p->name, name) == 0) + return p; + + return NULL; +} + +/* + * fs_parse - Parse a filesystem configuration parameter + * @fc: The filesystem context to log errors through. + * @desc: The parameter description to use. + * @param: The parameter. + * @result: Where to place the result of the parse + * + * Parse a filesystem configuration parameter and attempt a conversion for a + * simple parameter for which this is requested. If successful, the determined + * parameter ID is placed into @result->key, the desired type is indicated in + * @result->t and any converted value is placed into an appropriate member of + * the union in @result. + * + * The function returns the parameter number if the parameter was matched, + * -ENOPARAM if it wasn't matched and @desc->ignore_unknown indicated that + * unknown parameters are okay and -EINVAL if there was a conversion issue or + * the parameter wasn't recognised and unknowns aren't okay. + */ +int fs_parse(struct fs_context *fc, + const struct fs_parameter_description *desc, + struct fs_parameter *param, + struct fs_parse_result *result) +{ + const struct fs_parameter_spec *p; + const struct fs_parameter_enum *e; + int ret = -ENOPARAM, b; + + result->has_value = !!param->string; + result->negated = false; + result->uint_64 = 0; + + p = fs_lookup_key(desc, param->key); + if (!p) { + /* If we didn't find something that looks like "noxxx", see if + * "xxx" takes the "no"-form negative - but only if there + * wasn't an value. + */ + if (result->has_value) + goto unknown_parameter; + if (param->key[0] != 'n' || param->key[1] != 'o' || !param->key[2]) + goto unknown_parameter; + + p = fs_lookup_key(desc, param->key + 2); + if (!p) + goto unknown_parameter; + if (!(p->flags & fs_param_neg_with_no)) + goto unknown_parameter; + result->boolean = false; + result->negated = true; + } + + if (p->flags & fs_param_deprecated) + warnf(fc, "%s: Deprecated parameter '%s'", + desc->name, param->key); + + if (result->negated) + goto okay; + + /* Certain parameter types only take a string and convert it. */ + switch (p->type) { + case __fs_param_wasnt_defined: + return -EINVAL; + case fs_param_is_u32: + case fs_param_is_u32_octal: + case fs_param_is_u32_hex: + case fs_param_is_s32: + case fs_param_is_u64: + case fs_param_is_enum: + case fs_param_is_string: + if (param->type != fs_value_is_string) + goto bad_value; + if (!result->has_value) { + if (p->flags & fs_param_v_optional) + goto okay; + goto bad_value; + } + /* Fall through */ + default: + break; + } + + /* Try to turn the type we were given into the type desired by the + * parameter and give an error if we can't. 
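+	 * The integer conversions all funnel through maybe_okay so that
+	 * they share the bad-value reporting.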
+ */ + switch (p->type) { + case fs_param_is_flag: + if (param->type != fs_value_is_flag && + (param->type != fs_value_is_string || result->has_value)) + return invalf(fc, "%s: Unexpected value for '%s'", + desc->name, param->key); + result->boolean = true; + goto okay; + + case fs_param_is_bool: + switch (param->type) { + case fs_value_is_flag: + result->boolean = true; + goto okay; + case fs_value_is_string: + if (param->size == 0) { + result->boolean = true; + goto okay; + } + b = lookup_constant(bool_names, param->string, -1); + if (b == -1) + goto bad_value; + result->boolean = b; + goto okay; + default: + goto bad_value; + } + + case fs_param_is_u32: + ret = kstrtouint(param->string, 0, &result->uint_32); + goto maybe_okay; + case fs_param_is_u32_octal: + ret = kstrtouint(param->string, 8, &result->uint_32); + goto maybe_okay; + case fs_param_is_u32_hex: + ret = kstrtouint(param->string, 16, &result->uint_32); + goto maybe_okay; + case fs_param_is_s32: + ret = kstrtoint(param->string, 0, &result->int_32); + goto maybe_okay; + case fs_param_is_u64: + ret = kstrtoull(param->string, 0, &result->uint_64); + goto maybe_okay; + + case fs_param_is_enum: + for (e = desc->enums; e->name[0]; e++) { + if (e->opt == p->opt && + strcmp(e->name, param->string) == 0) { + result->uint_32 = e->value; + goto okay; + } + } + goto bad_value; + + case fs_param_is_string: + goto okay; + case fs_param_is_blob: + if (param->type != fs_value_is_blob) + goto bad_value; + goto okay; + + case fs_param_is_fd: { + if (param->type != fs_value_is_file) + goto bad_value; + goto okay; + } + + case fs_param_is_blockdev: + case fs_param_is_path: + goto okay; + default: + BUG(); + } + +maybe_okay: + if (ret < 0) + goto bad_value; +okay: + return p->opt; + +bad_value: + return invalf(fc, "%s: Bad value for '%s'", desc->name, param->key); +unknown_parameter: + return -ENOPARAM; +} +EXPORT_SYMBOL(fs_parse); + +/** + * fs_lookup_param - Look up a path referred to by a parameter + * @fc: The filesystem context to log errors through. + * @param: The parameter. + * @want_bdev: T if want a blockdev + * @_path: The result of the lookup + */ +int fs_lookup_param(struct fs_context *fc, + struct fs_parameter *param, + bool want_bdev, + struct path *_path) +{ + struct filename *f; + unsigned int flags = 0; + bool put_f; + int ret; + + switch (param->type) { + case fs_value_is_string: + f = getname_kernel(param->string); + if (IS_ERR(f)) + return PTR_ERR(f); + put_f = true; + break; + case fs_value_is_filename_empty: + flags = LOOKUP_EMPTY; + /* Fall through */ + case fs_value_is_filename: + f = param->name; + put_f = false; + break; + default: + return invalf(fc, "%s: not usable as path", param->key); + } + + ret = filename_lookup(param->dirfd, f, flags, _path, NULL); + if (ret < 0) { + errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name); + goto out; + } + + if (want_bdev && + !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) { + path_put(_path); + _path->dentry = NULL; + _path->mnt = NULL; + errorf(fc, "%s: Non-blockdev passed as '%s'", + param->key, f->name); + ret = -ENOTBLK; + } + +out: + if (put_f) + putname(f); + return ret; +} +EXPORT_SYMBOL(fs_lookup_param); + +#ifdef CONFIG_VALIDATE_FS_PARSER +/** + * validate_constant_table - Validate a constant table + * @name: Name to use in reporting + * @tbl: The constant table to validate. + * @tbl_size: The size of the table. + * @low: The lowest permissible value. + * @high: The highest permissible value. + * @special: One special permissible value outside of the range. 
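+ *
+ * Returns true if the table is well-formed: names sorted and unique,
+ * and every value within [@low, @high] or equal to @special.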
+ */ +bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, + int low, int high, int special) +{ + size_t i; + bool good = true; + + if (tbl_size == 0) { + pr_warn("VALIDATE C-TBL: Empty\n"); + return true; + } + + for (i = 0; i < tbl_size; i++) { + if (!tbl[i].name) { + pr_err("VALIDATE C-TBL[%zu]: Null\n", i); + good = false; + } else if (i > 0 && tbl[i - 1].name) { + int c = strcmp(tbl[i-1].name, tbl[i].name); + + if (c == 0) { + pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n", + i, tbl[i].name); + good = false; + } + if (c > 0) { + pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n", + i, tbl[i-1].name, tbl[i].name); + good = false; + } + } + + if (tbl[i].value != special && + (tbl[i].value < low || tbl[i].value > high)) { + pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n", + i, tbl[i].name, tbl[i].value, low, high); + good = false; + } + } + + return good; +} + +/** + * fs_validate_description - Validate a parameter description + * @desc: The parameter description to validate. + */ +bool fs_validate_description(const struct fs_parameter_description *desc) +{ + const struct fs_parameter_spec *param, *p2; + const struct fs_parameter_enum *e; + const char *name = desc->name; + unsigned int nr_params = 0; + bool good = true, enums = false; + + pr_notice("*** VALIDATE %s ***\n", name); + + if (!name[0]) { + pr_err("VALIDATE Parser: No name\n"); + name = "Unknown"; + good = false; + } + + if (desc->specs) { + for (param = desc->specs; param->name; param++) { + enum fs_parameter_type t = param->type; + + /* Check that the type is in range */ + if (t == __fs_param_wasnt_defined || + t >= nr__fs_parameter_type) { + pr_err("VALIDATE %s: PARAM[%s] Bad type %u\n", + name, param->name, t); + good = false; + } else if (t == fs_param_is_enum) { + enums = true; + } + + /* Check for duplicate parameter names */ + for (p2 = desc->specs; p2 < param; p2++) { + if (strcmp(param->name, p2->name) == 0) { + pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n", + name, param->name); + good = false; + } + } + } + + nr_params = param - desc->specs; + } + + if (desc->enums) { + if (!nr_params) { + pr_err("VALIDATE %s: Enum table but no parameters\n", + name); + good = false; + goto no_enums; + } + if (!enums) { + pr_err("VALIDATE %s: Enum table but no enum-type values\n", + name); + good = false; + goto no_enums; + } + + for (e = desc->enums; e->name[0]; e++) { + /* Check that all entries in the enum table have at + * least one parameter that uses them. + */ + for (param = desc->specs; param->name; param++) { + if (param->opt == e->opt && + param->type != fs_param_is_enum) { + pr_err("VALIDATE %s: e[%tu] enum val for %s\n", + name, e - desc->enums, param->name); + good = false; + } + } + } + + /* Check that all enum-type parameters have at least one enum + * value in the enum table. 
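+			 * A mismatch here usually means the ->opt numbers in
+			 * the spec table and the enum table disagree.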
+ */ + for (param = desc->specs; param->name; param++) { + if (param->type != fs_param_is_enum) + continue; + for (e = desc->enums; e->name[0]; e++) + if (e->opt == param->opt) + break; + if (!e->name[0]) { + pr_err("VALIDATE %s: PARAM[%s] enum with no values\n", + name, param->name); + good = false; + } + } + } else { + if (enums) { + pr_err("VALIDATE %s: enum-type values, but no enum table\n", + name); + good = false; + goto no_enums; + } + } + +no_enums: + return good; +} +#endif /* CONFIG_VALIDATE_FS_PARSER */ diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 989df5accaee..fe80bea4ad89 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -35,7 +35,9 @@ static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { - fuse_abort_conn(fc, true); + if (fc->abort_err) + fc->aborted = true; + fuse_abort_conn(fc); fuse_conn_put(fc); } return count; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 8f68181256c0..55a26f351467 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -141,10 +141,11 @@ static int cuse_open(struct inode *inode, struct file *file) static int cuse_release(struct inode *inode, struct file *file) { + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - fuse_sync_release(ff, file->f_flags); + fuse_sync_release(fi, ff, file->f_flags); fuse_conn_put(fc); return 0; @@ -407,7 +408,7 @@ err_unlock: err_region: unregister_chrdev_region(devt, 1); err: - fuse_abort_conn(fc, false); + fuse_abort_conn(fc); goto out; } @@ -586,7 +587,7 @@ static ssize_t cuse_class_abort_store(struct device *dev, { struct cuse_conn *cc = dev_get_drvdata(dev); - fuse_abort_conn(&cc->fc, false); + fuse_abort_conn(&cc->fc); return count; } static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 809c0f2f9942..9971a35cf1ef 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -251,17 +251,18 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc, struct file *file) { struct fuse_req *req = NULL; + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); struct fuse_file *ff = file->private_data; do { wait_event(fc->reserved_req_waitq, ff->reserved_req); - spin_lock(&fc->lock); + spin_lock(&fi->lock); if (ff->reserved_req) { req = ff->reserved_req; ff->reserved_req = NULL; req->stolen_file = get_file(file); } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } while (!req); return req; @@ -273,16 +274,17 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc, static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) { struct file *file = req->stolen_file; + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); struct fuse_file *ff = file->private_data; WARN_ON(req->max_pages); - spin_lock(&fc->lock); + spin_lock(&fi->lock); memset(req, 0, sizeof(*req)); fuse_request_init(req, NULL, NULL, 0); BUG_ON(ff->reserved_req); ff->reserved_req = req; wake_up_all(&fc->reserved_req_waitq); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fput(file); } @@ -431,10 +433,16 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; - - spin_lock(&fiq->waitq.lock); - list_del_init(&req->intr_entry); - spin_unlock(&fiq->waitq.lock); + /* + * test_and_set_bit() implies smp_mb() between bit + * changing and below intr_entry check. Pairs with + * smp_mb() from queue_interrupt(). 
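+	 * Thus either this path sees the queued interrupt on intr_entry
+	 * and unlinks it below, or queue_interrupt() sees FR_FINISHED
+	 * and unlinks it itself.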
+ */ + if (!list_empty(&req->intr_entry)) { + spin_lock(&fiq->waitq.lock); + list_del_init(&req->intr_entry); + spin_unlock(&fiq->waitq.lock); + } WARN_ON(test_bit(FR_PENDING, &req->flags)); WARN_ON(test_bit(FR_SENT, &req->flags)); if (test_bit(FR_BACKGROUND, &req->flags)) { @@ -462,27 +470,43 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) fc->active_background--; flush_bg_queue(fc); spin_unlock(&fc->bg_lock); + } else { + /* Wake up waiter sleeping in request_wait_answer() */ + wake_up(&req->waitq); } - wake_up(&req->waitq); + if (req->end) req->end(fc, req); put_request: fuse_put_request(fc, req); } -static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->waitq.lock); - if (test_bit(FR_FINISHED, &req->flags)) { + /* Check for we've sent request to interrupt this req */ + if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { spin_unlock(&fiq->waitq.lock); - return; + return -EINVAL; } + if (list_empty(&req->intr_entry)) { list_add_tail(&req->intr_entry, &fiq->interrupts); + /* + * Pairs with smp_mb() implied by test_and_set_bit() + * from request_end(). + */ + smp_mb(); + if (test_bit(FR_FINISHED, &req->flags)) { + list_del_init(&req->intr_entry); + spin_unlock(&fiq->waitq.lock); + return 0; + } wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } spin_unlock(&fiq->waitq.lock); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + return 0; } static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) @@ -1306,7 +1330,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, goto err_unlock; if (!fiq->connected) { - err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; + err = fc->aborted ? -ECONNABORTED : -ENODEV; goto err_unlock; } @@ -1353,7 +1377,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); if (!fpq->connected) { - err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; + err = fc->aborted ? -ECONNABORTED : -ENODEV; goto out_end; } if (err) { @@ -1900,16 +1924,17 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_req *req; struct fuse_out_header oh; + err = -EINVAL; if (nbytes < sizeof(struct fuse_out_header)) - return -EINVAL; + goto out; err = fuse_copy_one(cs, &oh, sizeof(oh)); if (err) - goto err_finish; + goto copy_finish; err = -EINVAL; if (oh.len != nbytes) - goto err_finish; + goto copy_finish; /* * Zero oh.unique indicates unsolicited notification message @@ -1917,41 +1942,40 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, */ if (!oh.unique) { err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); - return err ? err : nbytes; + goto out; } err = -EINVAL; if (oh.error <= -1000 || oh.error > 0) - goto err_finish; + goto copy_finish; spin_lock(&fpq->lock); - err = -ENOENT; - if (!fpq->connected) - goto err_unlock_pq; + req = NULL; + if (fpq->connected) + req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); - req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); - if (!req) - goto err_unlock_pq; + err = -ENOENT; + if (!req) { + spin_unlock(&fpq->lock); + goto copy_finish; + } /* Is it an interrupt reply ID? 
*/ if (oh.unique & FUSE_INT_REQ_BIT) { __fuse_get_request(req); spin_unlock(&fpq->lock); - err = -EINVAL; - if (nbytes != sizeof(struct fuse_out_header)) { - fuse_put_request(fc, req); - goto err_finish; - } - - if (oh.error == -ENOSYS) + err = 0; + if (nbytes != sizeof(struct fuse_out_header)) + err = -EINVAL; + else if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) - queue_interrupt(&fc->iq, req); + err = queue_interrupt(&fc->iq, req); + fuse_put_request(fc, req); - fuse_copy_finish(cs); - return nbytes; + goto copy_finish; } clear_bit(FR_SENT, &req->flags); @@ -1977,14 +2001,12 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, spin_unlock(&fpq->lock); request_end(fc, req); - +out: return err ? err : nbytes; - err_unlock_pq: - spin_unlock(&fpq->lock); - err_finish: +copy_finish: fuse_copy_finish(cs); - return err; + goto out; } static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) @@ -2034,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; ret = -EINVAL; - if (rem < len) { - pipe_unlock(pipe); - goto out; - } + if (rem < len) + goto out_free; rem = len; while (rem) { @@ -2055,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; } else { - pipe_buf_get(pipe, ibuf); + if (!pipe_buf_get(pipe, ibuf)) + goto out_free; + *obuf = *ibuf; obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->len = rem; @@ -2078,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); pipe_lock(pipe); +out_free: for (idx = 0; idx < nbuf; idx++) pipe_buf_release(pipe, &bufs[idx]); pipe_unlock(pipe); -out: kvfree(bufs); return ret; } @@ -2109,11 +2131,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) return mask; } -/* - * Abort all requests on the given list (pending or processing) - * - * This function releases and reacquires fc->lock - */ +/* Abort all requests on the given list (pending or processing) */ static void end_requests(struct fuse_conn *fc, struct list_head *head) { while (!list_empty(head)) { @@ -2159,7 +2177,7 @@ static void end_polls(struct fuse_conn *fc) * is OK, the request will in that case be removed from the list before we touch * it. */ -void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) +void fuse_abort_conn(struct fuse_conn *fc) { struct fuse_iqueue *fiq = &fc->iq; @@ -2175,7 +2193,6 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) fc->connected = 0; spin_unlock(&fc->bg_lock); - fc->aborted = is_abort; fuse_set_initialized(fc); list_for_each_entry(fud, &fc->devices, entry) { struct fuse_pqueue *fpq = &fud->pq; @@ -2253,7 +2270,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { WARN_ON(fc->iq.fasync != NULL); - fuse_abort_conn(fc, false); + fuse_abort_conn(fc); } fuse_dev_free(fud); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index e909678afa2d..dd0f64f7bc06 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -149,21 +149,6 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, args->out.args[0].value = outarg; } -u64 fuse_get_attr_version(struct fuse_conn *fc) -{ - u64 curr_version; - - /* - * The spin lock isn't actually needed on 64bit archs, but we - * don't yet care too much about such optimizations. 
- */ - spin_lock(&fc->lock); - curr_version = fc->attr_version; - spin_unlock(&fc->lock); - - return curr_version; -} - /* * Check whether the dentry is still valid * @@ -222,9 +207,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) fuse_queue_forget(fc, forget, outarg.nodeid, 1); goto invalid; } - spin_lock(&fc->lock); + spin_lock(&fi->lock); fi->nlookup++; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } kfree(forget); if (ret == -ENOMEM) @@ -400,6 +385,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_create_in inarg; struct fuse_open_out outopen; struct fuse_entry_out outentry; + struct fuse_inode *fi; struct fuse_file *ff; /* Userspace expects S_IFREG in create mode */ @@ -451,7 +437,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, &outentry.attr, entry_attr_timeout(&outentry), 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); - fuse_sync_release(ff, flags); + fuse_sync_release(NULL, ff, flags); fuse_queue_forget(fc, forget, outentry.nodeid, 1); err = -ENOMEM; goto out_err; @@ -462,7 +448,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, fuse_dir_changed(dir); err = finish_open(file, entry, generic_file_open); if (err) { - fuse_sync_release(ff, flags); + fi = get_fuse_inode(inode); + fuse_sync_release(fi, ff, flags); } else { file->private_data = ff; fuse_finish_open(inode, file); @@ -671,8 +658,8 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) struct inode *inode = d_inode(entry); struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); /* * If i_nlink == 0 then unlink doesn't make sense, yet this can * happen if userspace filesystem is careless. 
It would be @@ -681,7 +668,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) */ if (inode->i_nlink > 0) drop_nlink(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_invalidate_attr(inode); fuse_dir_changed(dir); fuse_invalidate_entry_cache(entry); @@ -825,10 +812,10 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, if (!err) { struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); inc_nlink(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_invalidate_attr(inode); fuse_update_ctime(inode); } else if (err == -EINTR) { @@ -1356,15 +1343,14 @@ static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, */ void fuse_set_nowrite(struct inode *inode) { - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); BUG_ON(!inode_is_locked(inode)); - spin_lock(&fc->lock); + spin_lock(&fi->lock); BUG_ON(fi->writectr < 0); fi->writectr += FUSE_NOWRITE; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE); } @@ -1385,11 +1371,11 @@ static void __fuse_release_nowrite(struct inode *inode) void fuse_release_nowrite(struct inode *inode) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); + spin_lock(&fi->lock); __fuse_release_nowrite(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, @@ -1524,7 +1510,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, goto error; } - spin_lock(&fc->lock); + spin_lock(&fi->lock); /* the kernel maintains i_mtime locally */ if (trust_local_cmtime) { if (attr->ia_valid & ATTR_MTIME) @@ -1542,10 +1528,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, i_size_write(inode, outarg.attr.size); if (is_truncate) { - /* NOTE: this may release/reacquire fc->lock */ + /* NOTE: this may release/reacquire fi->lock */ __fuse_release_nowrite(inode); } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); /* * Only call invalidate_inode_pages2() after removing diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a59c16bd90ac..06096b60f1df 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,8 +19,6 @@ #include <linux/falloc.h> #include <linux/uio.h> -static const struct file_operations fuse_direct_io_file_operations; - static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, int opcode, struct fuse_open_out *outargp) { @@ -64,9 +62,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); - spin_lock(&fc->lock); - ff->kh = ++fc->khctr; - spin_unlock(&fc->lock); + ff->kh = atomic64_inc_return(&fc->khctr); return ff; } @@ -94,7 +90,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) if (refcount_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - if (ff->fc->no_open && !isdir) { + if (isdir ? 
ff->fc->no_opendir : ff->fc->no_open) { /* * Drop the release request when client does not * implement 'open' @@ -128,8 +124,9 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, return -ENOMEM; ff->fh = 0; - ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */ - if (!fc->no_open || isdir) { + /* Default for no-open */ + ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); + if (isdir ? !fc->no_opendir : !fc->no_open) { struct fuse_open_out outarg; int err; @@ -138,11 +135,14 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, ff->fh = outarg.fh; ff->open_flags = outarg.open_flags; - } else if (err != -ENOSYS || isdir) { + } else if (err != -ENOSYS) { fuse_file_free(ff); return err; } else { - fc->no_open = 1; + if (isdir) + fc->no_opendir = 1; + else + fc->no_open = 1; } } @@ -159,17 +159,16 @@ EXPORT_SYMBOL_GPL(fuse_do_open); static void fuse_link_write_file(struct file *file) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff = file->private_data; /* * file may be written through mmap, so chain it onto the * inodes's write_file list */ - spin_lock(&fc->lock); + spin_lock(&fi->lock); if (list_empty(&ff->write_entry)) list_add(&ff->write_entry, &fi->write_files); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } void fuse_finish_open(struct inode *inode, struct file *file) @@ -177,8 +176,6 @@ void fuse_finish_open(struct inode *inode, struct file *file) struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); - if (ff->open_flags & FOPEN_DIRECT_IO) - file->f_op = &fuse_direct_io_file_operations; if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); if (ff->open_flags & FOPEN_NONSEEKABLE) @@ -186,10 +183,10 @@ void fuse_finish_open(struct inode *inode, struct file *file) if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, 0); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_invalidate_attr(inode); if (fc->writeback_cache) file_update_time(file); @@ -224,14 +221,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) return err; } -static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) +static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, + int flags, int opcode) { struct fuse_conn *fc = ff->fc; struct fuse_req *req = ff->reserved_req; struct fuse_release_in *inarg = &req->misc.release.in; + /* Inode is NULL on error path of fuse_create_open() */ + if (likely(fi)) { + spin_lock(&fi->lock); + list_del(&ff->write_entry); + spin_unlock(&fi->lock); + } spin_lock(&fc->lock); - list_del(&ff->write_entry); if (!RB_EMPTY_NODE(&ff->polled_node)) rb_erase(&ff->polled_node, &fc->polled_files); spin_unlock(&fc->lock); @@ -249,11 +252,12 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) void fuse_release_common(struct file *file, bool isdir) { + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); struct fuse_file *ff = file->private_data; struct fuse_req *req = ff->reserved_req; int opcode = isdir ? 
FUSE_RELEASEDIR : FUSE_RELEASE; - fuse_prepare_release(ff, file->f_flags, opcode); + fuse_prepare_release(fi, ff, file->f_flags, opcode); if (ff->flock) { struct fuse_release_in *inarg = &req->misc.release.in; @@ -295,10 +299,10 @@ static int fuse_release(struct inode *inode, struct file *file) return 0; } -void fuse_sync_release(struct fuse_file *ff, int flags) +void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags) { WARN_ON(refcount_read(&ff->count) > 1); - fuse_prepare_release(ff, flags, FUSE_RELEASE); + fuse_prepare_release(fi, ff, flags, FUSE_RELEASE); /* * iput(NULL) is a no-op and since the refcount is 1 and everything's * synchronous, we are fine with not doing igrab() here */ @@ -329,33 +333,39 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) return (u64) v0 + ((u64) v1 << 32); } -/* - * Check if any page in a range is under writeback - * - * This is currently done by walking the list of writepage requests - * for the inode, which can be pretty inefficient. - */ -static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, - pgoff_t idx_to) +static struct fuse_req *fuse_find_writeback(struct fuse_inode *fi, + pgoff_t idx_from, pgoff_t idx_to) { - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_req *req; - bool found = false; - spin_lock(&fc->lock); list_for_each_entry(req, &fi->writepages, writepages_entry) { pgoff_t curr_index; - BUG_ON(req->inode != inode); + WARN_ON(get_fuse_inode(req->inode) != fi); curr_index = req->misc.write.in.offset >> PAGE_SHIFT; if (idx_from < curr_index + req->num_pages && curr_index <= idx_to) { - found = true; - break; + return req; } } - spin_unlock(&fc->lock); + return NULL; +} + +/* + * Check if any page in a range is under writeback + * + * This is currently done by walking the list of writepage requests + * for the inode, which can be pretty inefficient.
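+ *
+ * A typical caller probes just a single page, e.g. (illustrative):
+ *
+ *	fuse_range_is_writeback(inode, page->index, page->index);
+ *
+ * Each call walks the whole list, so the cost is linear in the number
+ * of queued writepage requests.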
+ */ +static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, + pgoff_t idx_to) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + bool found; + + spin_lock(&fi->lock); + found = fuse_find_writeback(fi, idx_from, idx_to); + spin_unlock(&fi->lock); return found; } @@ -598,9 +608,9 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; - spin_unlock(&fc->lock); + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + spin_unlock(&fi->lock); } io->iocb->ki_complete(io->iocb, res, 0); @@ -675,13 +685,13 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); + spin_lock(&fi->lock); if (attr_ver == fi->attr_version && size < inode->i_size && !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { - fi->attr_version = ++fc->attr_version; + fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, size); } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } static void fuse_short_read(struct fuse_req *req, struct inode *inode, @@ -919,7 +929,7 @@ out: return err; } -static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = iocb->ki_filp->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); @@ -996,13 +1006,13 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos) struct fuse_inode *fi = get_fuse_inode(inode); bool ret = false; - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); if (pos > inode->i_size) { i_size_write(inode, pos); ret = true; } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); return ret; } @@ -1125,9 +1135,6 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, int err = 0; ssize_t res = 0; - if (is_bad_inode(inode)) - return -EIO; - if (inode->i_size < pos + iov_iter_count(ii)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); @@ -1173,7 +1180,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, return res > 0 ? 
res : err; } -static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -1416,9 +1423,6 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, ssize_t res; struct inode *inode = file_inode(io->iocb->ki_filp); - if (is_bad_inode(inode)) - return -EIO; - res = fuse_direct_io(io, iter, ppos, 0); fuse_invalidate_atime(inode); @@ -1426,10 +1430,21 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, return res; } +static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter); + static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); - return __fuse_direct_read(&io, to, &iocb->ki_pos); + ssize_t res; + + if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + res = fuse_direct_IO(iocb, to); + } else { + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + + res = __fuse_direct_read(&io, to, &iocb->ki_pos); + } + + return res; } static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -1438,14 +1453,17 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; - if (is_bad_inode(inode)) - return -EIO; - /* Don't allow parallel writes to the same file */ inode_lock(inode); res = generic_write_checks(iocb, from); - if (res > 0) - res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + if (res > 0) { + if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + res = fuse_direct_IO(iocb, from); + } else { + res = fuse_direct_io(&io, from, &iocb->ki_pos, + FUSE_DIO_WRITE); + } + } fuse_invalidate_attr(inode); if (res > 0) fuse_write_update_size(inode, iocb->ki_pos); @@ -1454,6 +1472,34 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) return res; } +static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + + if (is_bad_inode(file_inode(file))) + return -EIO; + + if (!(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_cache_read_iter(iocb, to); + else + return fuse_direct_read_iter(iocb, to); +} + +static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + + if (is_bad_inode(file_inode(file))) + return -EIO; + + if (!(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_cache_write_iter(iocb, from); + else + return fuse_direct_write_iter(iocb, from); +} + static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { int i; @@ -1481,20 +1527,18 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) wake_up(&fi->page_waitq); } -/* Called under fc->lock, may release and reacquire it */ +/* Called under fi->lock, may release and reacquire it */ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req, loff_t size) -__releases(fc->lock) -__acquires(fc->lock) +__releases(fi->lock) +__acquires(fi->lock) { + struct fuse_req *aux, *next; struct fuse_inode *fi = get_fuse_inode(req->inode); struct fuse_write_in *inarg = &req->misc.write.in; __u64 data_size = req->num_pages * PAGE_SIZE; bool queued; - if (!fc->connected) - goto out_free; - if (inarg->offset + data_size <= size) { inarg->size = 
data_size; } else if (inarg->offset < size) { @@ -1505,28 +1549,40 @@ __acquires(fc->lock) } req->in.args[1].size = inarg->size; - fi->writectr++; queued = fuse_request_queue_background(fc, req); - WARN_ON(!queued); + /* Fails on broken connection only */ + if (unlikely(!queued)) + goto out_free; + + fi->writectr++; return; out_free: fuse_writepage_finish(fc, req); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); + + /* After fuse_writepage_finish() aux request list is private */ + for (aux = req->misc.write.next; aux; aux = next) { + next = aux->misc.write.next; + aux->misc.write.next = NULL; + fuse_writepage_free(fc, aux); + fuse_put_request(fc, aux); + } + fuse_writepage_free(fc, req); fuse_put_request(fc, req); - spin_lock(&fc->lock); + spin_lock(&fi->lock); } /* * If fi->writectr is positive (no truncate or fsync going on) send * all queued writepage requests. * - * Called with fc->lock + * Called with fi->lock */ void fuse_flush_writepages(struct inode *inode) -__releases(fc->lock) -__acquires(fc->lock) +__releases(fi->lock) +__acquires(fi->lock) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -1546,7 +1602,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) struct fuse_inode *fi = get_fuse_inode(inode); mapping_set_error(inode->i_mapping, req->out.h.error); - spin_lock(&fc->lock); + spin_lock(&fi->lock); while (req->misc.write.next) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_write_in *inarg = &req->misc.write.in; @@ -1583,7 +1639,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) } fi->writectr--; fuse_writepage_finish(fc, req); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_writepage_free(fc, req); } @@ -1592,13 +1648,13 @@ static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, { struct fuse_file *ff = NULL; - spin_lock(&fc->lock); + spin_lock(&fi->lock); if (!list_empty(&fi->write_files)) { ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); fuse_file_get(ff); } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); return ff; } @@ -1669,11 +1725,11 @@ static int fuse_writepage_locked(struct page *page) inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); - spin_lock(&fc->lock); + spin_lock(&fi->lock); list_add(&req->writepages_entry, &fi->writepages); list_add_tail(&req->list, &fi->queued_writes); fuse_flush_writepages(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); end_page_writeback(page); @@ -1722,21 +1778,27 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) { struct fuse_req *req = data->req; struct inode *inode = data->inode; - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); int num_pages = req->num_pages; int i; req->ff = fuse_file_get(data->ff); - spin_lock(&fc->lock); + spin_lock(&fi->lock); list_add_tail(&req->list, &fi->queued_writes); fuse_flush_writepages(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); for (i = 0; i < num_pages; i++) end_page_writeback(data->orig_pages[i]); } +/* + * First recheck under fi->lock if the offending offset is still under + * writeback. If yes, then iterate auxiliary write requests, to see if there's + * one already added for a page at this offset. If there's none, then insert + * this new request onto the auxiliary list, otherwise reuse the existing one by + * copying the new page contents over to the old temporary page. 
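+ *
+ * The recheck under fi->lock is needed because the request found by
+ * an earlier, unlocked writeback check may have completed by the
+ * time the lock is taken here.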
+ */ static bool fuse_writepage_in_flight(struct fuse_req *new_req, struct page *page) { @@ -1744,57 +1806,50 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, struct fuse_inode *fi = get_fuse_inode(new_req->inode); struct fuse_req *tmp; struct fuse_req *old_req; - bool found = false; - pgoff_t curr_index; - BUG_ON(new_req->num_pages != 0); + WARN_ON(new_req->num_pages != 0); - spin_lock(&fc->lock); + spin_lock(&fi->lock); list_del(&new_req->writepages_entry); - list_for_each_entry(old_req, &fi->writepages, writepages_entry) { - BUG_ON(old_req->inode != new_req->inode); - curr_index = old_req->misc.write.in.offset >> PAGE_SHIFT; - if (curr_index <= page->index && - page->index < curr_index + old_req->num_pages) { - found = true; - break; - } - } - if (!found) { + old_req = fuse_find_writeback(fi, page->index, page->index); + if (!old_req) { list_add(&new_req->writepages_entry, &fi->writepages); - goto out_unlock; + spin_unlock(&fi->lock); + return false; } new_req->num_pages = 1; - for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) { - BUG_ON(tmp->inode != new_req->inode); + for (tmp = old_req->misc.write.next; tmp; tmp = tmp->misc.write.next) { + pgoff_t curr_index; + + WARN_ON(tmp->inode != new_req->inode); curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT; - if (tmp->num_pages == 1 && - curr_index == page->index) { - old_req = tmp; + if (curr_index == page->index) { + WARN_ON(tmp->num_pages != 1); + WARN_ON(!test_bit(FR_PENDING, &tmp->flags)); + swap(tmp->pages[0], new_req->pages[0]); + break; } } - if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) { - struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host); + if (!tmp) { + new_req->misc.write.next = old_req->misc.write.next; + old_req->misc.write.next = new_req; + } + + spin_unlock(&fi->lock); - copy_highpage(old_req->pages[0], page); - spin_unlock(&fc->lock); + if (tmp) { + struct backing_dev_info *bdi = inode_to_bdi(new_req->inode); dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); - goto out; - } else { - new_req->misc.write.next = old_req->misc.write.next; - old_req->misc.write.next = new_req; } -out_unlock: - spin_unlock(&fc->lock); -out: - return found; + + return true; } static int fuse_writepages_fill(struct page *page, @@ -1803,6 +1858,7 @@ static int fuse_writepages_fill(struct page *page, struct fuse_fill_wb_data *data = _data; struct fuse_req *req = data->req; struct inode *inode = data->inode; + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); struct page *tmp_page; bool is_writeback; @@ -1873,9 +1929,9 @@ static int fuse_writepages_fill(struct page *page, req->end = fuse_writepage_end; req->inode = inode; - spin_lock(&fc->lock); + spin_lock(&fi->lock); list_add(&req->writepages_entry, &fi->writepages); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); data->req = req; } @@ -1898,12 +1954,12 @@ static int fuse_writepages_fill(struct page *page, data->orig_pages[req->num_pages] = page; /* - * Protected by fc->lock against concurrent access by + * Protected by fi->lock against concurrent access by * fuse_page_is_writeback(). 
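+ * (fuse_find_writeback() reads req->num_pages while walking the
+ * writepages list, so the increment below must not race with that
+ * walk.)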
*/ - spin_lock(&fc->lock); + spin_lock(&fi->lock); req->num_pages++; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); out_unlock: unlock_page(page); @@ -2087,6 +2143,18 @@ static const struct vm_operations_struct fuse_file_vm_ops = { static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { + struct fuse_file *ff = file->private_data; + + if (ff->open_flags & FOPEN_DIRECT_IO) { + /* Can't provide the coherency needed for MAP_SHARED */ + if (vma->vm_flags & VM_MAYSHARE) + return -ENODEV; + + invalidate_inode_pages2(file->f_mapping); + + return generic_file_mmap(file, vma); + } + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) fuse_link_write_file(file); @@ -2095,17 +2163,6 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) -{ - /* Can't provide the coherency needed for MAP_SHARED */ - if (vma->vm_flags & VM_MAYSHARE) - return -ENODEV; - - invalidate_inode_pages2(file->f_mapping); - - return generic_file_mmap(file, vma); -} - static int convert_fuse_file_lock(struct fuse_conn *fc, const struct fuse_file_lock *ffl, struct file_lock *fl) @@ -3114,6 +3171,7 @@ static const struct file_operations fuse_file_operations = { .lock = fuse_file_lock, .flock = fuse_file_flock, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, @@ -3121,24 +3179,6 @@ static const struct file_operations fuse_file_operations = { .copy_file_range = fuse_copy_file_range, }; -static const struct file_operations fuse_direct_io_file_operations = { - .llseek = fuse_file_llseek, - .read_iter = fuse_direct_read_iter, - .write_iter = fuse_direct_write_iter, - .mmap = fuse_direct_mmap, - .open = fuse_open, - .flush = fuse_flush, - .release = fuse_release, - .fsync = fuse_fsync, - .lock = fuse_file_lock, - .flock = fuse_file_flock, - .unlocked_ioctl = fuse_file_ioctl, - .compat_ioctl = fuse_file_compat_ioctl, - .poll = fuse_file_poll, - .fallocate = fuse_file_fallocate, - /* no splice_read */ -}; - static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, .writepage = fuse_writepage, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 2f2c92e6f8cb..0920c0c032a0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -96,7 +96,7 @@ struct fuse_inode { union { /* Write related fields (regular file only) */ struct { - /* Files usable in writepage. Protected by fc->lock */ + /* Files usable in writepage. Protected by fi->lock */ struct list_head write_files; /* Writepages pending on truncate or fsync */ @@ -144,6 +144,9 @@ struct fuse_inode { /** Lock for serializing lookup and readdir for back compatibility*/ struct mutex mutex; + + /** Lock to protect write related fields */ + spinlock_t lock; }; /** FUSE inode state bits */ @@ -163,7 +166,10 @@ struct fuse_file { /** Fuse connection for this file */ struct fuse_conn *fc; - /** Request reserved for flush and release */ + /* + * Request reserved for flush and release. + * Modified under relative fuse_inode::lock. 
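+ * ("relative" meaning the fuse_inode::lock of the inode this
+ * file was opened against.)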
+ */ struct fuse_req *reserved_req; /** Kernel file handle guaranteed to be unique */ @@ -538,7 +544,7 @@ struct fuse_conn { struct fuse_iqueue iq; /** The next unique kernel file handle */ - u64 khctr; + atomic64_t khctr; /** rbtree of fuse_files waiting for poll events indexed by ph */ struct rb_root polled_files; @@ -624,6 +630,9 @@ struct fuse_conn { /** Is open/release not implemented by fs? */ unsigned no_open:1; + /** Is opendir/releasedir not implemented by fs? */ + unsigned no_opendir:1; + /** Is fsync not implemented by fs? */ unsigned no_fsync:1; @@ -730,7 +739,7 @@ struct fuse_conn { struct fuse_req *destroy_req; /** Version counter for attribute changes */ - u64 attr_version; + atomic64_t attr_version; /** Called on final put */ void (*release)(struct fuse_conn *); @@ -770,6 +779,11 @@ static inline int invalid_nodeid(u64 nodeid) return !nodeid || nodeid == FUSE_ROOT_ID; } +static inline u64 fuse_get_attr_version(struct fuse_conn *fc) +{ + return atomic64_read(&fc->attr_version); +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; @@ -817,7 +831,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file); -void fuse_sync_release(struct fuse_file *ff, int flags); +void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags); /** * Send RELEASE or RELEASEDIR request @@ -936,7 +950,7 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ -void fuse_abort_conn(struct fuse_conn *fc, bool is_abort); +void fuse_abort_conn(struct fuse_conn *fc); void fuse_wait_aborted(struct fuse_conn *fc); /** @@ -1000,8 +1014,6 @@ void fuse_flush_writepages(struct inode *inode); void fuse_set_nowrite(struct inode *inode); void fuse_release_nowrite(struct inode *inode); -u64 fuse_get_attr_version(struct fuse_conn *fc); - /** * File-system tells the kernel to invalidate cache for the given node id. 
*/ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c2d4099429be..ec5d9953dfb6 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -97,6 +97,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->orig_ino = 0; fi->state = 0; mutex_init(&fi->mutex); + spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); if (!fi->forget) { kmem_cache_free(fuse_inode_cachep, inode); @@ -163,7 +164,9 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - fi->attr_version = ++fc->attr_version; + lockdep_assert_held(&fi->lock); + + fi->attr_version = atomic64_inc_return(&fc->attr_version); fi->i_time = attr_valid; WRITE_ONCE(fi->inval_mask, 0); @@ -209,10 +212,10 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, loff_t oldsize; struct timespec64 old_mtime; - spin_lock(&fc->lock); + spin_lock(&fi->lock); if ((attr_version != 0 && fi->attr_version > attr_version) || test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); return; } @@ -227,7 +230,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, */ if (!is_wb || !S_ISREG(inode->i_mode)) i_size_write(inode, attr->size); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); if (!is_wb && S_ISREG(inode->i_mode)) { bool inval = false; @@ -322,9 +325,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, } fi = get_fuse_inode(inode); - spin_lock(&fc->lock); + spin_lock(&fi->lock); fi->nlookup++; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_change_attributes(inode, attr, attr_valid, attr_version); return inode; @@ -376,7 +379,7 @@ void fuse_unlock_inode(struct inode *inode, bool locked) static void fuse_umount_begin(struct super_block *sb) { - fuse_abort_conn(get_fuse_conn_super(sb), false); + fuse_abort_conn(get_fuse_conn_super(sb)); } static void fuse_send_destroy(struct fuse_conn *fc) @@ -619,12 +622,12 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) atomic_set(&fc->num_waiting, 0); fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; - fc->khctr = 0; + atomic64_set(&fc->khctr, 0); fc->polled_files = RB_ROOT; fc->blocked = 0; fc->initialized = 0; fc->connected = 1; - fc->attr_version = 1; + atomic64_set(&fc->attr_version, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); @@ -969,7 +972,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | - FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS; + FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | + FUSE_NO_OPENDIR_SUPPORT; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1010,7 +1014,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) if (err) return err; - sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; /* fuse does it's own writeback accounting */ sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; @@ -1242,7 +1246,7 @@ static void fuse_sb_destroy(struct super_block *sb) if (fc) { fuse_send_destroy(fc); - fuse_abort_conn(fc, 
false); + fuse_abort_conn(fc); fuse_wait_aborted(fc); down_write(&fc->killsb); diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index ab18b78f4755..574d03f8a573 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -213,9 +213,9 @@ retry: } fi = get_fuse_inode(inode); - spin_lock(&fc->lock); + spin_lock(&fi->lock); fi->nlookup++; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); forget_all_cached_acls(inode); fuse_change_attributes(inode, &o->attr, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b92740edc416..d32964cd1117 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -107,7 +107,7 @@ static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode, static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name) { - u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0); + u32 hash = jhash2((u32 *)name, ht_parms.key_len / 4, 0); return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS); } @@ -2131,71 +2131,29 @@ static const struct file_operations gfs2_sbstats_fops = { .release = seq_release, }; -int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) -{ - struct dentry *dent; - - dent = debugfs_create_dir(sdp->sd_table_name, gfs2_root); - if (IS_ERR_OR_NULL(dent)) - goto fail; - sdp->debugfs_dir = dent; - - dent = debugfs_create_file("glocks", - S_IFREG | S_IRUGO, - sdp->debugfs_dir, sdp, - &gfs2_glocks_fops); - if (IS_ERR_OR_NULL(dent)) - goto fail; - sdp->debugfs_dentry_glocks = dent; - - dent = debugfs_create_file("glstats", - S_IFREG | S_IRUGO, - sdp->debugfs_dir, sdp, - &gfs2_glstats_fops); - if (IS_ERR_OR_NULL(dent)) - goto fail; - sdp->debugfs_dentry_glstats = dent; - - dent = debugfs_create_file("sbstats", - S_IFREG | S_IRUGO, - sdp->debugfs_dir, sdp, - &gfs2_sbstats_fops); - if (IS_ERR_OR_NULL(dent)) - goto fail; - sdp->debugfs_dentry_sbstats = dent; +void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) +{ + sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root); - return 0; -fail: - gfs2_delete_debugfs_file(sdp); - return dent ? PTR_ERR(dent) : -ENOMEM; + debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, + &gfs2_glocks_fops); + + debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, + &gfs2_glstats_fops); + + debugfs_create_file("sbstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, + &gfs2_sbstats_fops); } void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp) { - if (sdp->debugfs_dir) { - if (sdp->debugfs_dentry_glocks) { - debugfs_remove(sdp->debugfs_dentry_glocks); - sdp->debugfs_dentry_glocks = NULL; - } - if (sdp->debugfs_dentry_glstats) { - debugfs_remove(sdp->debugfs_dentry_glstats); - sdp->debugfs_dentry_glstats = NULL; - } - if (sdp->debugfs_dentry_sbstats) { - debugfs_remove(sdp->debugfs_dentry_sbstats); - sdp->debugfs_dentry_sbstats = NULL; - } - debugfs_remove(sdp->debugfs_dir); - sdp->debugfs_dir = NULL; - } + debugfs_remove_recursive(sdp->debugfs_dir); + sdp->debugfs_dir = NULL; } -int gfs2_register_debugfs(void) +void gfs2_register_debugfs(void) { gfs2_root = debugfs_create_dir("gfs2", NULL); - if (IS_ERR(gfs2_root)) - return PTR_ERR(gfs2_root); - return gfs2_root ? 
0 : -ENOMEM; } void gfs2_unregister_debugfs(void) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 8949bf28b249..936b3295839c 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -243,9 +243,9 @@ extern void gfs2_glock_free(struct gfs2_glock *gl); extern int __init gfs2_glock_init(void); extern void gfs2_glock_exit(void); -extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); +extern void gfs2_create_debugfs_file(struct gfs2_sbd *sdp); extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); -extern int gfs2_register_debugfs(void); +extern void gfs2_register_debugfs(void); extern void gfs2_unregister_debugfs(void); extern const struct lm_lockops gfs2_dlm_ops; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e10e0b0a7cd5..cdf07b408f54 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -853,9 +853,6 @@ struct gfs2_sbd { unsigned long sd_last_warning; struct dentry *debugfs_dir; /* debugfs directory */ - struct dentry *debugfs_dentry_glocks; - struct dentry *debugfs_dentry_glstats; - struct dentry *debugfs_dentry_sbstats; }; static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which) diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 793808263c6d..18d4af7417fa 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -59,8 +59,8 @@ static inline u64 gfs2_get_inode_blocks(const struct inode *inode) static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change) { - gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change)); - change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK); + change <<= inode->i_blkbits - GFS2_BASIC_BLOCK_SHIFT; + gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change)); inode->i_blocks += change; } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index c7603063f861..136484ef35d3 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -178,16 +178,12 @@ static int __init init_gfs2_fs(void) if (!gfs2_page_pool) goto fail_mempool; - error = gfs2_register_debugfs(); - if (error) - goto fail_debugfs; + gfs2_register_debugfs(); pr_info("GFS2 installed\n"); return 0; -fail_debugfs: - mempool_destroy(gfs2_page_pool); fail_mempool: destroy_workqueue(gfs2_freeze_wq); fail_wq3: diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h index 823a328791c0..302f45101a96 100644 --- a/fs/hpfs/hpfs.h +++ b/fs/hpfs/hpfs.h @@ -120,11 +120,11 @@ struct hpfs_spare_block u8 bad_sector: 1; /* bad sector, corrupted disk (???) */ u8 bad_bitmap: 1; /* bad bitmap */ u8 fast: 1; /* partition was fast formatted */ - u8 old_wrote: 1; /* old version wrote to partion */ - u8 old_wrote_1: 1; /* old version wrote to partion (?) */ + u8 old_wrote: 1; /* old version wrote to partition */ + u8 old_wrote_1: 1; /* old version wrote to partition (?) */ #else - u8 old_wrote_1: 1; /* old version wrote to partion (?) */ - u8 old_wrote: 1; /* old version wrote to partion */ + u8 old_wrote_1: 1; /* old version wrote to partition (?) */ + u8 old_wrote: 1; /* old version wrote to partition */ u8 fast: 1; /* partition was fast formatted */ u8 bad_bitmap: 1; /* bad bitmap */ u8 bad_sector: 1; /* bad sector, corrupted disk (???) 
*/ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b0eef008de67..9285dd4f4b1c 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -27,7 +27,7 @@ #include <linux/backing-dev.h> #include <linux/hugetlb.h> #include <linux/pagevec.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/dnotify.h> @@ -45,11 +45,17 @@ const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; -struct hugetlbfs_config { +enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT }; + +struct hugetlbfs_fs_context { struct hstate *hstate; + unsigned long long max_size_opt; + unsigned long long min_size_opt; long max_hpages; long nr_inodes; long min_hpages; + enum hugetlbfs_size_type max_val_type; + enum hugetlbfs_size_type min_val_type; kuid_t uid; kgid_t gid; umode_t mode; @@ -57,22 +63,30 @@ struct hugetlbfs_config { int sysctl_hugetlb_shm_group; -enum { - Opt_size, Opt_nr_inodes, - Opt_mode, Opt_uid, Opt_gid, - Opt_pagesize, Opt_min_size, - Opt_err, +enum hugetlb_param { + Opt_gid, + Opt_min_size, + Opt_mode, + Opt_nr_inodes, + Opt_pagesize, + Opt_size, + Opt_uid, }; -static const match_table_t tokens = { - {Opt_size, "size=%s"}, - {Opt_nr_inodes, "nr_inodes=%s"}, - {Opt_mode, "mode=%o"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_pagesize, "pagesize=%s"}, - {Opt_min_size, "min_size=%s"}, - {Opt_err, NULL}, +static const struct fs_parameter_spec hugetlb_param_specs[] = { + fsparam_u32 ("gid", Opt_gid), + fsparam_string("min_size", Opt_min_size), + fsparam_u32 ("mode", Opt_mode), + fsparam_string("nr_inodes", Opt_nr_inodes), + fsparam_string("pagesize", Opt_pagesize), + fsparam_string("size", Opt_size), + fsparam_u32 ("uid", Opt_uid), + {} +}; + +static const struct fs_parameter_description hugetlb_fs_parameters = { + .name = "hugetlbfs", + .specs = hugetlb_param_specs, }; #ifdef CONFIG_NUMA @@ -708,16 +722,16 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) } static struct inode *hugetlbfs_get_root(struct super_block *sb, - struct hugetlbfs_config *config) + struct hugetlbfs_fs_context *ctx) { struct inode *inode; inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_mode = S_IFDIR | config->mode; - inode->i_uid = config->uid; - inode->i_gid = config->gid; + inode->i_mode = S_IFDIR | ctx->mode; + inode->i_uid = ctx->uid; + inode->i_gid = ctx->gid; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; @@ -741,11 +755,17 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev) { struct inode *inode; - struct resv_map *resv_map; + struct resv_map *resv_map = NULL; - resv_map = resv_map_alloc(); - if (!resv_map) - return NULL; + /* + * Reserve maps are only needed for inodes that can have associated + * page allocations. 
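+ *
+ * Directories and special files never have huge pages allocated on
+ * their behalf, so allocating a reserve map for them would only
+ * waste memory.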
+ */ + if (S_ISREG(mode) || S_ISLNK(mode)) { + resv_map = resv_map_alloc(); + if (!resv_map) + return NULL; + } inode = new_inode(sb); if (inode) { @@ -780,8 +800,10 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); - } else - kref_put(&resv_map->refs, resv_map_release); + } else { + if (resv_map) + kref_put(&resv_map->refs, resv_map_release); + } return inode; } @@ -1093,8 +1115,6 @@ static const struct super_operations hugetlbfs_ops = { .show_options = hugetlbfs_show_options, }; -enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT }; - /* * Convert size option passed from command line to number of huge pages * in the pool specified by hstate. Size option could be in bytes @@ -1117,170 +1137,151 @@ hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, return size_opt; } -static int -hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) +/* + * Parse one mount parameter. + */ +static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p, *rest; - substring_t args[MAX_OPT_ARGS]; - int option; - unsigned long long max_size_opt = 0, min_size_opt = 0; - enum hugetlbfs_size_type max_val_type = NO_SIZE, min_val_type = NO_SIZE; - - if (!options) + struct hugetlbfs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + char *rest; + unsigned long ps; + int opt; + + opt = fs_parse(fc, &hugetlb_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + ctx->uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(ctx->uid)) + goto bad_val; return 0; - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; + case Opt_gid: + ctx->gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(ctx->gid)) + goto bad_val; + return 0; - token = match_token(p, tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - goto bad_val; - pconfig->uid = make_kuid(current_user_ns(), option); - if (!uid_valid(pconfig->uid)) - goto bad_val; - break; + case Opt_mode: + ctx->mode = result.uint_32 & 01777U; + return 0; - case Opt_gid: - if (match_int(&args[0], &option)) - goto bad_val; - pconfig->gid = make_kgid(current_user_ns(), option); - if (!gid_valid(pconfig->gid)) - goto bad_val; - break; + case Opt_size: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(param->string[0])) + goto bad_val; + ctx->max_size_opt = memparse(param->string, &rest); + ctx->max_val_type = SIZE_STD; + if (*rest == '%') + ctx->max_val_type = SIZE_PERCENT; + return 0; - case Opt_mode: - if (match_octal(&args[0], &option)) - goto bad_val; - pconfig->mode = option & 01777U; - break; + case Opt_nr_inodes: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(param->string[0])) + goto bad_val; + ctx->nr_inodes = memparse(param->string, &rest); + return 0; - case Opt_size: { - /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(*args[0].from)) - goto bad_val; - max_size_opt = memparse(args[0].from, &rest); - max_val_type = SIZE_STD; - if (*rest == '%') - max_val_type = SIZE_PERCENT; - break; + case Opt_pagesize: + ps = memparse(param->string, &rest); + ctx->hstate = size_to_hstate(ps); + if (!ctx->hstate) { + pr_err("Unsupported page size %lu MB\n", ps >> 20); + return -EINVAL; } + return 0; - case Opt_nr_inodes: - /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(*args[0].from)) - goto bad_val; - 
pconfig->nr_inodes = memparse(args[0].from, &rest); - break; + case Opt_min_size: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(param->string[0])) + goto bad_val; + ctx->min_size_opt = memparse(param->string, &rest); + ctx->min_val_type = SIZE_STD; + if (*rest == '%') + ctx->min_val_type = SIZE_PERCENT; + return 0; - case Opt_pagesize: { - unsigned long ps; - ps = memparse(args[0].from, &rest); - pconfig->hstate = size_to_hstate(ps); - if (!pconfig->hstate) { - pr_err("Unsupported page size %lu MB\n", - ps >> 20); - return -EINVAL; - } - break; - } + default: + return -EINVAL; + } - case Opt_min_size: { - /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(*args[0].from)) - goto bad_val; - min_size_opt = memparse(args[0].from, &rest); - min_val_type = SIZE_STD; - if (*rest == '%') - min_val_type = SIZE_PERCENT; - break; - } +bad_val: + return invalf(fc, "hugetlbfs: Bad value '%s' for mount option '%s'\n", + param->string, param->key); +} - default: - pr_err("Bad mount option: \"%s\"\n", p); - return -EINVAL; - break; - } - } +/* + * Validate the parsed options. + */ +static int hugetlbfs_validate(struct fs_context *fc) +{ + struct hugetlbfs_fs_context *ctx = fc->fs_private; /* * Use huge page pool size (in hstate) to convert the size * options to number of huge pages. If NO_SIZE, -1 is returned. */ - pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, - max_size_opt, max_val_type); - pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, - min_size_opt, min_val_type); + ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate, + ctx->max_size_opt, + ctx->max_val_type); + ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate, + ctx->min_size_opt, + ctx->min_val_type); /* * If max_size was specified, then min_size must be smaller */ - if (max_val_type > NO_SIZE && - pconfig->min_hpages > pconfig->max_hpages) { - pr_err("minimum size can not be greater than maximum size\n"); + if (ctx->max_val_type > NO_SIZE && + ctx->min_hpages > ctx->max_hpages) { + pr_err("Minimum size can not be greater than maximum size\n"); return -EINVAL; } return 0; - -bad_val: - pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p); - return -EINVAL; } static int -hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) +hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) { - int ret; - struct hugetlbfs_config config; + struct hugetlbfs_fs_context *ctx = fc->fs_private; struct hugetlbfs_sb_info *sbinfo; - config.max_hpages = -1; /* No limit on size by default */ - config.nr_inodes = -1; /* No limit on number of inodes by default */ - config.uid = current_fsuid(); - config.gid = current_fsgid(); - config.mode = 0755; - config.hstate = &default_hstate; - config.min_hpages = -1; /* No default minimum size */ - ret = hugetlbfs_parse_options(data, &config); - if (ret) - return ret; - sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); if (!sbinfo) return -ENOMEM; sb->s_fs_info = sbinfo; - sbinfo->hstate = config.hstate; spin_lock_init(&sbinfo->stat_lock); - sbinfo->max_inodes = config.nr_inodes; - sbinfo->free_inodes = config.nr_inodes; - sbinfo->spool = NULL; - sbinfo->uid = config.uid; - sbinfo->gid = config.gid; - sbinfo->mode = config.mode; + sbinfo->hstate = ctx->hstate; + sbinfo->max_inodes = ctx->nr_inodes; + sbinfo->free_inodes = ctx->nr_inodes; + sbinfo->spool = NULL; + sbinfo->uid = ctx->uid; + sbinfo->gid = ctx->gid; + sbinfo->mode = ctx->mode; /* * Allocate and initialize subpool if maximum or 
minimum size is * specified. Any needed reservations (for minimim size) are taken * taken when the subpool is created. */ - if (config.max_hpages != -1 || config.min_hpages != -1) { - sbinfo->spool = hugepage_new_subpool(config.hstate, - config.max_hpages, - config.min_hpages); + if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { + sbinfo->spool = hugepage_new_subpool(ctx->hstate, + ctx->max_hpages, + ctx->min_hpages); if (!sbinfo->spool) goto out_free; } sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = huge_page_size(config.hstate); - sb->s_blocksize_bits = huge_page_shift(config.hstate); + sb->s_blocksize = huge_page_size(ctx->hstate); + sb->s_blocksize_bits = huge_page_shift(ctx->hstate); sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; - sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config)); + sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx)); if (!sb->s_root) goto out_free; return 0; @@ -1290,16 +1291,52 @@ out_free: return -ENOMEM; } -static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hugetlbfs_get_tree(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super); + int err = hugetlbfs_validate(fc); + if (err) + return err; + return vfs_get_super(fc, vfs_get_independent_super, hugetlbfs_fill_super); +} + +static void hugetlbfs_fs_context_free(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations hugetlbfs_fs_context_ops = { + .free = hugetlbfs_fs_context_free, + .parse_param = hugetlbfs_parse_param, + .get_tree = hugetlbfs_get_tree, +}; + +static int hugetlbfs_init_fs_context(struct fs_context *fc) +{ + struct hugetlbfs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->max_hpages = -1; /* No limit on size by default */ + ctx->nr_inodes = -1; /* No limit on number of inodes by default */ + ctx->uid = current_fsuid(); + ctx->gid = current_fsgid(); + ctx->mode = 0755; + ctx->hstate = &default_hstate; + ctx->min_hpages = -1; /* No default minimum size */ + ctx->max_val_type = NO_SIZE; + ctx->min_val_type = NO_SIZE; + fc->fs_private = ctx; + fc->ops = &hugetlbfs_fs_context_ops; + return 0; } static struct file_system_type hugetlbfs_fs_type = { - .name = "hugetlbfs", - .mount = hugetlbfs_mount, - .kill_sb = kill_litter_super, + .name = "hugetlbfs", + .init_fs_context = hugetlbfs_init_fs_context, + .parameters = &hugetlb_fs_parameters, + .kill_sb = kill_litter_super, }; static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; @@ -1384,8 +1421,29 @@ out: return file; } +static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) +{ + struct fs_context *fc; + struct vfsmount *mnt; + + fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT); + if (IS_ERR(fc)) { + mnt = ERR_CAST(fc); + } else { + struct hugetlbfs_fs_context *ctx = fc->fs_private; + ctx->hstate = h; + mnt = fc_mount(fc); + put_fs_context(fc); + } + if (IS_ERR(mnt)) + pr_err("Cannot mount internal hugetlbfs for page size %uK", + 1U << (h->order + PAGE_SHIFT - 10)); + return mnt; +} + static int __init init_hugetlbfs_fs(void) { + struct vfsmount *mnt; struct hstate *h; int error; int i; @@ -1408,24 +1466,16 @@ static int __init init_hugetlbfs_fs(void) i = 0; for_each_hstate(h) { - char buf[50]; - unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); - - snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); - hugetlbfs_vfsmount[i] = 
kern_mount_data(&hugetlbfs_fs_type, - buf); - - if (IS_ERR(hugetlbfs_vfsmount[i])) { - pr_err("Cannot mount internal hugetlbfs for " - "page size %uK", ps_kb); - error = PTR_ERR(hugetlbfs_vfsmount[i]); - hugetlbfs_vfsmount[i] = NULL; + mnt = mount_one_hugetlbfs(h); + if (IS_ERR(mnt) && i == 0) { + error = PTR_ERR(mnt); + goto out; } + hugetlbfs_vfsmount[i] = mnt; i++; } - /* Non default hstates are optional */ - if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) - return 0; + + return 0; out: kmem_cache_destroy(hugetlbfs_inode_cachep); diff --git a/fs/inode.c b/fs/inode.c index e9d97add2b36..9a453f3637f8 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1817,8 +1817,13 @@ int file_remove_privs(struct file *file) int kill; int error = 0; - /* Fast path for nothing security related */ - if (IS_NOSEC(inode)) + /* + * Fast path for nothing security related. + * As well for non-regular files, e.g. blkdev inodes. + * For example, blkdev_write_iter() might get here + * trying to remove privs which it is not allowed to. + */ + if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; kill = dentry_needs_remove_privs(dentry); diff --git a/fs/internal.h b/fs/internal.h index d410186bc369..6a8b71643af4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -17,6 +17,7 @@ struct linux_binprm; struct path; struct mount; struct shrink_control; +struct fs_context; /* * block_dev.c @@ -52,8 +53,16 @@ int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied, extern void __init chrdev_init(void); /* + * fs_context.c + */ +extern int parse_monolithic_mount_data(struct fs_context *, void *); +extern void fc_drop_locked(struct fs_context *); + +/* * namei.c */ +extern int filename_lookup(int dfd, struct filename *name, unsigned flags, + struct path *path, struct path *root); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); @@ -99,10 +108,8 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *); /* * super.c */ -extern int do_remount_sb(struct super_block *, int, void *, int); +extern int reconfigure_super(struct fs_context *); extern bool trylock_super(struct super_block *sb); -extern struct dentry *mount_fs(struct file_system_type *, - int, const char *, void *); extern struct super_block *user_get_super(dev_t); /* diff --git a/fs/io_uring.c b/fs/io_uring.c index 5d99376d2369..84efb8956734 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4,15 +4,28 @@ * supporting fast/efficient IO. * * A note on the read/write ordering memory barriers that are matched between - * the application and kernel side. When the application reads the CQ ring - * tail, it must use an appropriate smp_rmb() to order with the smp_wmb() - * the kernel uses after writing the tail. Failure to do so could cause a - * delay in when the application notices that completion events available. - * This isn't a fatal condition. Likewise, the application must use an - * appropriate smp_wmb() both before writing the SQ tail, and after writing - * the SQ tail. The first one orders the sqe writes with the tail write, and - * the latter is paired with the smp_rmb() the kernel will issue before - * reading the SQ tail on submission. + * the application and kernel side. 
+ * + * After the application reads the CQ ring tail, it must use an + * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses + * before writing the tail (using smp_load_acquire to read the tail will + * do). It also needs a smp_mb() before updating CQ head (ordering the + * entry load(s) with the head store), pairing with an implicit barrier + * through a control-dependency in io_get_cqring (smp_store_release to + * store head will do). Failure to do so could lead to reading invalid + * CQ entries. + * + * Likewise, the application must use an appropriate smp_wmb() before + * writing the SQ tail (ordering SQ entry stores with the tail store), + * which pairs with smp_load_acquire in io_get_sqring (smp_store_release + * to store the tail will do). And it needs a barrier ordering the SQ + * head load before writing new SQ entries (smp_load_acquire to read + * head will do). + * + * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application + * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* + * updating the SQ tail; a full memory barrier smp_mb() is needed + * between. * * Also see the examples in the liburing library: * @@ -70,20 +83,108 @@ struct io_uring { u32 tail ____cacheline_aligned_in_smp; }; +/* + * This data is shared with the application through the mmap at offset + * IORING_OFF_SQ_RING. + * + * The offsets to the member fields are published through struct + * io_sqring_offsets when calling io_uring_setup. + */ struct io_sq_ring { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The kernel controls head and the application controls tail. + */ struct io_uring r; + /* + * Bitmask to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ u32 ring_mask; + /* Ring size (constant, power of 2) */ u32 ring_entries; + /* + * Number of invalid entries dropped by the kernel due to + * invalid index stored in array + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * After a new SQ head value was read by the application this + * counter includes all submissions that were dropped reaching + * the new SQ head (and possibly more). + */ u32 dropped; + /* + * Runtime flags + * + * Written by the kernel, shouldn't be modified by the + * application. + * + * The application needs a full memory barrier before checking + * for IORING_SQ_NEED_WAKEUP after updating the sq tail. + */ u32 flags; + /* + * Ring buffer of indices into array of io_uring_sqe, which is + * mmapped by the application using the IORING_OFF_SQES offset. + * + * This indirection could e.g. be used to assign fixed + * io_uring_sqe entries to operations and only submit them to + * the queue when needed. + * + * The kernel modifies neither the indices array nor the entries + * array. + */ u32 array[]; }; +/* + * This data is shared with the application through the mmap at offset + * IORING_OFF_CQ_RING. + * + * The offsets to the member fields are published through struct + * io_cqring_offsets when calling io_uring_setup. + */ struct io_cq_ring { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The application controls head and the kernel tail. 
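+ *
+ * For example, a consumer might process completions as follows
+ * (illustrative sketch; cq_head, cq_tail and cq_mask stand for the
+ * application's pointers into this mapped structure, and the
+ * barriers follow the rules described at the top of this file):
+ *
+ *	unsigned head = *cq_head;
+ *
+ *	while (head != smp_load_acquire(cq_tail)) {
+ *		struct io_uring_cqe *cqe = &cqes[head & *cq_mask];
+ *
+ *		... consume cqe->user_data and cqe->res ...
+ *		head++;
+ *	}
+ *	smp_store_release(cq_head, head);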
+ */ struct io_uring r; + /* + * Bitmask to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ u32 ring_mask; + /* Ring size (constant, power of 2) */ u32 ring_entries; + /* + * Number of completion events lost because the queue was full; + * this should be avoided by the application by making sure + * there are not more requests pending than there is space in + * the completion queue. + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * As completion events come in out of order this counter is not + * ordered with any other data. + */ u32 overflow; + /* + * Ring buffer of completion events. + * + * The kernel writes completion events fresh every time they are + * produced, so the application is allowed to modify pending + * entries. + */ struct io_uring_cqe cqes[]; }; @@ -189,17 +290,28 @@ struct sqe_submit { bool needs_fixed_file; }; +/* + * First field must be the file pointer in all the + * iocb unions! See also 'struct kiocb' in <linux/fs.h> + */ struct io_poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool woken; + bool done; bool canceled; struct wait_queue_entry wait; }; +/* + * NOTE! Each of the iocb union members has the file pointer + * as the first entry in their struct definition. So you can + * access the file pointer through any of the sub-structs, + * or directly as just 'ki_filp' in this struct. + */ struct io_kiocb { union { + struct file *file; struct kiocb rw; struct io_poll_iocb poll; }; @@ -210,10 +322,11 @@ struct io_kiocb { struct list_head list; unsigned int flags; refcount_t refs; -#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ +#define REQ_F_NOWAIT 1 /* must not punt to workers */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ #define REQ_F_FIXED_FILE 4 /* ctx owns file */ #define REQ_F_SEQ_PREV 8 /* sequential with previous */ +#define REQ_F_PREPPED 16 /* prep already done */ u64 user_data; u64 error; @@ -305,12 +418,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) /* order cqe stores with ring update */ smp_store_release(&ring->r.tail, ctx->cached_cq_tail); - /* - * Write sider barrier of tail update, app has read side. See - * comment at the top of this file. 
- */ - smp_wmb(); - if (wq_has_sleeper(&ctx->cq_wait)) { wake_up_interruptible(&ctx->cq_wait); kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); @@ -324,9 +431,12 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) unsigned tail; tail = ctx->cached_cq_tail; - /* See comment at the top of the file */ - smp_rmb(); - if (tail + 1 == READ_ONCE(ring->r.head)) + /* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ + if (tail - READ_ONCE(ring->r.head) == ring->ring_entries) return NULL; ctx->cached_cq_tail++; @@ -355,20 +465,25 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, } } -static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data, +static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +{ + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); + if (waitqueue_active(&ctx->sqo_wait)) + wake_up(&ctx->sqo_wait); +} + +static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, long res, unsigned ev_flags) { unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - io_cqring_fill_event(ctx, ki_user_data, res, ev_flags); + io_cqring_fill_event(ctx, user_data, res, ev_flags); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); - if (waitqueue_active(&ctx->sqo_wait)) - wake_up(&ctx->sqo_wait); + io_cqring_ev_posted(ctx); } static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) @@ -382,13 +497,14 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, struct io_submit_state *state) { + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; if (!percpu_ref_tryget(&ctx->refs)) return NULL; if (!state) { - req = kmem_cache_alloc(req_cachep, __GFP_NOWARN); + req = kmem_cache_alloc(req_cachep, gfp); if (unlikely(!req)) goto out; } else if (!state->free_reqs) { @@ -396,10 +512,18 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, int ret; sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs)); - ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz, - state->reqs); - if (unlikely(ret <= 0)) - goto out; + ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs); + + /* + * Bulk alloc is all-or-nothing. If we fail to get a batch, + * retry single alloc to be on the safe side. 
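+ * A single request is enough for the submitter to make forward
+ * progress, hence the one-off kmem_cache_alloc() fallback below.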
+ */ + if (unlikely(ret <= 0)) { + state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); + if (!state->reqs[0]) + goto out; + ret = 1; + } state->free_reqs = ret - 1; state->cur_req = 1; req = state->reqs[0]; @@ -411,7 +535,8 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, req->ctx = ctx; req->flags = 0; - refcount_set(&req->refs, 0); + /* one is dropped after submission, the other at completion */ + refcount_set(&req->refs, 2); return req; out: io_ring_drop_ctx_refs(ctx, 1); @@ -429,10 +554,16 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) static void io_free_req(struct io_kiocb *req) { - if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) { - io_ring_drop_ctx_refs(req->ctx, 1); - kmem_cache_free(req_cachep, req); - } + if (req->file && !(req->flags & REQ_F_FIXED_FILE)) + fput(req->file); + io_ring_drop_ctx_refs(req->ctx, 1); + kmem_cache_free(req_cachep, req); +} + +static void io_put_req(struct io_kiocb *req) +{ + if (refcount_dec_and_test(&req->refs)) + io_free_req(req); } /* @@ -442,44 +573,34 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, struct list_head *done) { void *reqs[IO_IOPOLL_BATCH]; - int file_count, to_free; - struct file *file = NULL; struct io_kiocb *req; + int to_free; - file_count = to_free = 0; + to_free = 0; while (!list_empty(done)) { req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); io_cqring_fill_event(ctx, req->user_data, req->error, 0); - - reqs[to_free++] = req; (*nr_events)++; - /* - * Batched puts of the same file, to avoid dirtying the - * file usage count multiple times, if avoidable. - */ - if (!(req->flags & REQ_F_FIXED_FILE)) { - if (!file) { - file = req->rw.ki_filp; - file_count = 1; - } else if (file == req->rw.ki_filp) { - file_count++; + if (refcount_dec_and_test(&req->refs)) { + /* If we're not using fixed files, we have to pair the + * completion part with the file put. Use regular + * completions for those, only batch free for fixed + * file. 
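+ * (For non-fixed files, io_free_req() also drops the file
+ * reference via fput(), which cannot be folded into the batched
+ * kmem_cache free.)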
+ */ + if (req->flags & REQ_F_FIXED_FILE) { + reqs[to_free++] = req; + if (to_free == ARRAY_SIZE(reqs)) + io_free_req_many(ctx, reqs, &to_free); } else { - fput_many(file, file_count); - file = req->rw.ki_filp; - file_count = 1; + io_free_req(req); } } - - if (to_free == ARRAY_SIZE(reqs)) - io_free_req_many(ctx, reqs, &to_free); } - io_commit_cqring(ctx); - if (file) - fput_many(file, file_count); + io_commit_cqring(ctx); io_free_req_many(ctx, reqs, &to_free); } @@ -602,21 +723,14 @@ static void kiocb_end_write(struct kiocb *kiocb) } } -static void io_fput(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_FIXED_FILE)) - fput(req->rw.ki_filp); -} - static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); kiocb_end_write(kiocb); - io_fput(req); io_cqring_add_event(req->ctx, req->user_data, res, 0); - io_free_req(req); + io_put_req(req); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -666,11 +780,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req) list_add_tail(&req->list, &ctx->poll_list); } -static void io_file_put(struct io_submit_state *state, struct file *file) +static void io_file_put(struct io_submit_state *state) { - if (!state) { - fput(file); - } else if (state->file) { + if (state->file) { int diff = state->has_refs - state->used_refs; if (diff) @@ -695,7 +807,7 @@ static struct file *io_file_get(struct io_submit_state *state, int fd) state->ios_left--; return state->file; } - io_file_put(state, NULL); + io_file_put(state); } state->file = fget_many(fd, state->ios_left); if (!state->file) @@ -726,36 +838,23 @@ static bool io_file_supports_async(struct file *file) } static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) + bool force_nonblock) { const struct io_uring_sqe *sqe = s->sqe; struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; - unsigned ioprio, flags; - int fd, ret; + unsigned ioprio; + int ret; + if (!req->file) + return -EBADF; /* For -EAGAIN retry, everything is already prepped */ - if (kiocb->ki_filp) + if (req->flags & REQ_F_PREPPED) return 0; - flags = READ_ONCE(sqe->flags); - fd = READ_ONCE(sqe->fd); + if (force_nonblock && !io_file_supports_async(req->file)) + force_nonblock = false; - if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->user_files || - (unsigned) fd >= ctx->nr_user_files)) - return -EBADF; - kiocb->ki_filp = ctx->user_files[fd]; - req->flags |= REQ_F_FIXED_FILE; - } else { - if (s->needs_fixed_file) - return -EBADF; - kiocb->ki_filp = io_file_get(state, fd); - if (unlikely(!kiocb->ki_filp)) - return -EBADF; - if (force_nonblock && !io_file_supports_async(kiocb->ki_filp)) - force_nonblock = false; - } kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -764,7 +863,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, if (ioprio) { ret = ioprio_check_cap(ioprio); if (ret) - goto out_fput; + return ret; kiocb->ki_ioprio = ioprio; } else @@ -772,38 +871,30 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); if (unlikely(ret)) - goto out_fput; - if (force_nonblock) { + return ret; + + /* don't allow async punt if RWF_NOWAIT was requested */ + if (kiocb->ki_flags & IOCB_NOWAIT) + req->flags |= REQ_F_NOWAIT; + + if (force_nonblock) 
kiocb->ki_flags |= IOCB_NOWAIT; - req->flags |= REQ_F_FORCE_NONBLOCK; - } + if (ctx->flags & IORING_SETUP_IOPOLL) { - ret = -EOPNOTSUPP; if (!(kiocb->ki_flags & IOCB_DIRECT) || !kiocb->ki_filp->f_op->iopoll) - goto out_fput; + return -EOPNOTSUPP; req->error = 0; kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; } else { - if (kiocb->ki_flags & IOCB_HIPRI) { - ret = -EINVAL; - goto out_fput; - } + if (kiocb->ki_flags & IOCB_HIPRI) + return -EINVAL; kiocb->ki_complete = io_complete_rw; } + req->flags |= REQ_F_PREPPED; return 0; -out_fput: - if (!(flags & IOSQE_FIXED_FILE)) { - /* - * in case of error, we didn't use this file reference. drop it. - */ - if (state) - state->used_refs--; - io_file_put(state, kiocb->ki_filp); - } - return ret; } static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) @@ -864,6 +955,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); if (offset) iov_iter_advance(iter, offset); + + /* don't drop a reference to these pages */ + iter->type |= ITER_BVEC_FLAG_NO_REF; return 0; } @@ -887,7 +981,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw, opcode = READ_ONCE(sqe->opcode); if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ssize_t ret = io_import_fixed(ctx, rw, sqe, iter); + int ret = io_import_fixed(ctx, rw, sqe, iter); *iovec = NULL; return ret; } @@ -923,7 +1017,7 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) /* Use 8x RA size as a decent limiter for both reads/writes */ max_pages = filp->f_ra.ra_pages; if (!max_pages) - max_pages = VM_MAX_READAHEAD >> (PAGE_SHIFT - 10); + max_pages = VM_READAHEAD_PAGES; max_pages *= 8; /* If max pages are exceeded, reset the state */ @@ -945,31 +1039,29 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) async_list->io_end = io_end; } -static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) +static int io_read(struct io_kiocb *req, const struct sqe_submit *s, + bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t ret; + int ret; - ret = io_prep_rw(req, s, force_nonblock, state); + ret = io_prep_rw(req, s, force_nonblock); if (ret) return ret; file = kiocb->ki_filp; - ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_READ))) - goto out_fput; - ret = -EINVAL; + return -EBADF; if (unlikely(!file->f_op->read_iter)) - goto out_fput; + return -EINVAL; ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); if (ret) - goto out_fput; + return ret; iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); @@ -991,38 +1083,32 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s, } } kfree(iovec); -out_fput: - /* Hold on to the file for -EAGAIN */ - if (unlikely(ret && ret != -EAGAIN)) - io_fput(req); return ret; } -static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) +static int io_write(struct io_kiocb *req, const struct sqe_submit *s, + bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t ret; + int ret; - ret = io_prep_rw(req, s, force_nonblock, state); + ret = 
io_prep_rw(req, s, force_nonblock); if (ret) return ret; - ret = -EBADF; file = kiocb->ki_filp; if (unlikely(!(file->f_mode & FMODE_WRITE))) - goto out_fput; - ret = -EINVAL; + return -EBADF; if (unlikely(!file->f_op->write_iter)) - goto out_fput; + return -EINVAL; ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); if (ret) - goto out_fput; + return ret; iov_count = iov_iter_count(&iter); @@ -1036,6 +1122,8 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s, ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); if (!ret) { + ssize_t ret2; + /* * Open-code file_start_write here to grab freeze protection, * which will be released by another thread in @@ -1050,14 +1138,22 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s, SB_FREEZE_WRITE); } kiocb->ki_flags |= IOCB_WRITE; - io_rw_done(kiocb, call_write_iter(file, kiocb, &iter)); + + ret2 = call_write_iter(file, kiocb, &iter); + if (!force_nonblock || ret2 != -EAGAIN) { + io_rw_done(kiocb, ret2); + } else { + /* + * If ->needs_lock is true, we're already in async + * context. + */ + if (!s->needs_lock) + io_async_list_note(WRITE, req, iov_count); + ret = -EAGAIN; + } } out_free: kfree(iovec); -out_fput: - /* Hold on to the file for -EAGAIN */ - if (unlikely(ret && ret != -EAGAIN)) - io_fput(req); return ret; } @@ -1072,29 +1168,19 @@ static int io_nop(struct io_kiocb *req, u64 user_data) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - /* - * Twilight zone - it's possible that someone issued an opcode that - * has a file attached, then got -EAGAIN on submission, and changed - * the sqe before we retried it from async context. Avoid dropping - * a file reference for this malicious case, and flag the error. - */ - if (req->rw.ki_filp) { - err = -EBADF; - io_fput(req); - } io_cqring_add_event(ctx, user_data, err, 0); - io_free_req(req); + io_put_req(req); return 0; } static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - unsigned flags; - int fd; - /* Prep already done */ - if (req->rw.ki_filp) + if (!req->file) + return -EBADF; + /* Prep already done (EAGAIN retry) */ + if (req->flags & REQ_F_PREPPED) return 0; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) @@ -1102,20 +1188,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; - fd = READ_ONCE(sqe->fd); - flags = READ_ONCE(sqe->flags); - - if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files)) - return -EBADF; - req->rw.ki_filp = ctx->user_files[fd]; - req->flags |= REQ_F_FIXED_FILE; - } else { - req->rw.ki_filp = fget(fd); - if (unlikely(!req->rw.ki_filp)) - return -EBADF; - } - + req->flags |= REQ_F_PREPPED; return 0; } @@ -1144,9 +1217,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, end > 0 ? 
end : LLONG_MAX, fsync_flags & IORING_FSYNC_DATASYNC); - io_fput(req); io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); - io_free_req(req); + io_put_req(req); return 0; } @@ -1204,15 +1276,16 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) spin_unlock_irq(&ctx->completion_lock); io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); - io_free_req(req); + io_put_req(req); return 0; } -static void io_poll_complete(struct io_kiocb *req, __poll_t mask) +static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req, + __poll_t mask) { - io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0); - io_fput(req); - io_free_req(req); + req->poll.done = true; + io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0); + io_commit_cqring(ctx); } static void io_poll_complete_work(struct work_struct *work) @@ -1240,9 +1313,11 @@ static void io_poll_complete_work(struct work_struct *work) return; } list_del_init(&req->list); + io_poll_complete(ctx, req, mask); spin_unlock_irq(&ctx->completion_lock); - io_poll_complete(req, mask); + io_cqring_ev_posted(ctx); + io_put_req(req); } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -1253,29 +1328,25 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); struct io_ring_ctx *ctx = req->ctx; __poll_t mask = key_to_poll(key); - - poll->woken = true; + unsigned long flags; /* for instances that support it check for an event match first: */ - if (mask) { - unsigned long flags; + if (mask && !(mask & poll->events)) + return 0; - if (!(mask & poll->events)) - return 0; + list_del_init(&poll->wait.entry); - /* try to complete the iocb inline if we can: */ - if (spin_trylock_irqsave(&ctx->completion_lock, flags)) { - list_del(&req->list); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { + list_del(&req->list); + io_poll_complete(ctx, req, mask); + spin_unlock_irqrestore(&ctx->completion_lock, flags); - list_del_init(&poll->wait.entry); - io_poll_complete(req, mask); - return 1; - } + io_cqring_ev_posted(ctx); + io_put_req(req); + } else { + queue_work(ctx->sqo_wq, &req->work); } - list_del_init(&poll->wait.entry); - queue_work(ctx->sqo_wq, &req->work); return 1; } @@ -1305,36 +1376,23 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_poll_iocb *poll = &req->poll; struct io_ring_ctx *ctx = req->ctx; struct io_poll_table ipt; - unsigned flags; + bool cancel = false; __poll_t mask; u16 events; - int fd; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) return -EINVAL; + if (!poll->file) + return -EBADF; INIT_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - flags = READ_ONCE(sqe->flags); - fd = READ_ONCE(sqe->fd); - - if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files)) - return -EBADF; - poll->file = ctx->user_files[fd]; - req->flags |= REQ_F_FIXED_FILE; - } else { - poll->file = fget(fd); - } - if (unlikely(!poll->file)) - return -EBADF; - poll->head = NULL; - poll->woken = false; + poll->done = false; poll->canceled = false; ipt.pt._qproc = io_poll_queue_proc; @@ -1346,56 +1404,43 @@ static int io_poll_add(struct io_kiocb *req, const struct 
io_uring_sqe *sqe) INIT_LIST_HEAD(&poll->wait.entry); init_waitqueue_func_entry(&poll->wait, io_poll_wake); - /* one for removal from waitqueue, one for this function */ - refcount_set(&req->refs, 2); - mask = vfs_poll(poll->file, &ipt.pt) & poll->events; - if (unlikely(!poll->head)) { - /* we did not manage to set up a waitqueue, done */ - goto out; - } spin_lock_irq(&ctx->completion_lock); - spin_lock(&poll->head->lock); - if (poll->woken) { - /* wake_up context handles the rest */ - mask = 0; + if (likely(poll->head)) { + spin_lock(&poll->head->lock); + if (unlikely(list_empty(&poll->wait.entry))) { + if (ipt.error) + cancel = true; + ipt.error = 0; + mask = 0; + } + if (mask || ipt.error) + list_del_init(&poll->wait.entry); + else if (cancel) + WRITE_ONCE(poll->canceled, true); + else if (!poll->done) /* actually waiting for an event */ + list_add_tail(&req->list, &ctx->cancel_list); + spin_unlock(&poll->head->lock); + } + if (mask) { /* no async, we'd stolen it */ + req->error = mangle_poll(mask); ipt.error = 0; - } else if (mask || ipt.error) { - /* if we get an error or a mask we are done */ - WARN_ON_ONCE(list_empty(&poll->wait.entry)); - list_del_init(&poll->wait.entry); - } else { - /* actually waiting for an event */ - list_add_tail(&req->list, &ctx->cancel_list); + io_poll_complete(ctx, req, mask); } - spin_unlock(&poll->head->lock); spin_unlock_irq(&ctx->completion_lock); -out: - if (unlikely(ipt.error)) { - if (!(flags & IOSQE_FIXED_FILE)) - fput(poll->file); - /* - * Drop one of our refs to this req, __io_submit_sqe() will - * drop the other one since we're returning an error. - */ - io_free_req(req); - return ipt.error; + if (mask) { + io_cqring_ev_posted(ctx); + io_put_req(req); } - - if (mask) - io_poll_complete(req, mask); - io_free_req(req); - return 0; + return ipt.error; } static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct sqe_submit *s, bool force_nonblock, - struct io_submit_state *state) + const struct sqe_submit *s, bool force_nonblock) { - ssize_t ret; - int opcode; + int ret, opcode; if (unlikely(s->index >= ctx->sq_entries)) return -EINVAL; @@ -1409,18 +1454,18 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_READV: if (unlikely(s->sqe->buf_index)) return -EINVAL; - ret = io_read(req, s, force_nonblock, state); + ret = io_read(req, s, force_nonblock); break; case IORING_OP_WRITEV: if (unlikely(s->sqe->buf_index)) return -EINVAL; - ret = io_write(req, s, force_nonblock, state); + ret = io_write(req, s, force_nonblock); break; case IORING_OP_READ_FIXED: - ret = io_read(req, s, force_nonblock, state); + ret = io_read(req, s, force_nonblock); break; case IORING_OP_WRITE_FIXED: - ret = io_write(req, s, force_nonblock, state); + ret = io_write(req, s, force_nonblock); break; case IORING_OP_FSYNC: ret = io_fsync(req, s->sqe, force_nonblock); @@ -1493,8 +1538,7 @@ restart: struct sqe_submit *s = &req->submit; const struct io_uring_sqe *sqe = s->sqe; - /* Ensure we clear previously set forced non-block flag */ - req->flags &= ~REQ_F_FORCE_NONBLOCK; + /* Ensure we clear previously set non-block flag */ req->rw.ki_flags &= ~IOCB_NOWAIT; ret = 0; @@ -1513,7 +1557,7 @@ restart: s->has_user = cur_mm != NULL; s->needs_lock = true; do { - ret = __io_submit_sqe(ctx, req, s, false, NULL); + ret = __io_submit_sqe(ctx, req, s, false); /* * We can get EAGAIN for polled IO even though * we're forcing a sync submission from here, @@ -1525,9 +1569,13 @@ restart: cond_resched(); } while (1); } + + /* drop 
submission reference */ + io_put_req(req); + if (ret) { io_cqring_add_event(ctx, sqe->user_data, ret, 0); - io_free_req(req); + io_put_req(req); } /* async context always use a copy of the sqe */ @@ -1614,11 +1662,55 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req) return ret; } +static bool io_op_needs_file(const struct io_uring_sqe *sqe) +{ + int op = READ_ONCE(sqe->opcode); + + switch (op) { + case IORING_OP_NOP: + case IORING_OP_POLL_REMOVE: + return false; + default: + return true; + } +} + +static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, + struct io_submit_state *state, struct io_kiocb *req) +{ + unsigned flags; + int fd; + + flags = READ_ONCE(s->sqe->flags); + fd = READ_ONCE(s->sqe->fd); + + if (!io_op_needs_file(s->sqe)) { + req->file = NULL; + return 0; + } + + if (flags & IOSQE_FIXED_FILE) { + if (unlikely(!ctx->user_files || + (unsigned) fd >= ctx->nr_user_files)) + return -EBADF; + req->file = ctx->user_files[fd]; + req->flags |= REQ_F_FIXED_FILE; + } else { + if (s->needs_fixed_file) + return -EBADF; + req->file = io_file_get(state, fd); + if (unlikely(!req->file)) + return -EBADF; + } + + return 0; +} + static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, struct io_submit_state *state) { struct io_kiocb *req; - ssize_t ret; + int ret; /* enforce forwards compatibility on users */ if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE)) @@ -1628,10 +1720,12 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, if (unlikely(!req)) return -EAGAIN; - req->rw.ki_filp = NULL; + ret = io_req_set_file(ctx, s, state, req); + if (unlikely(ret)) + goto out; - ret = __io_submit_sqe(ctx, req, s, true, state); - if (ret == -EAGAIN) { + ret = __io_submit_sqe(ctx, req, s, true); + if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); @@ -1649,11 +1743,23 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, INIT_WORK(&req->work, io_sq_wq_submit_work); queue_work(ctx->sqo_wq, &req->work); } - ret = 0; + + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually + * submitted. + */ + return 0; } } + +out: + /* drop submission reference */ + io_put_req(req); + + /* and drop final reference, if we failed */ if (ret) - io_free_req(req); + io_put_req(req); return ret; } @@ -1664,7 +1770,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, static void io_submit_state_end(struct io_submit_state *state) { blk_finish_plug(&state->plug); - io_file_put(state, NULL); + io_file_put(state); if (state->free_reqs) kmem_cache_free_bulk(req_cachep, state->free_reqs, &state->reqs[state->cur_req]); @@ -1693,24 +1799,10 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * write new data to them. */ smp_store_release(&ring->r.head, ctx->cached_sq_head); - - /* - * write side barrier of head update, app has read side. See - * comment at the top of this file - */ - smp_wmb(); } } /* - * Undo last io_get_sqring() - */ -static void io_drop_sqring(struct io_ring_ctx *ctx) -{ - ctx->cached_sq_head--; -} - -/* * Fetch an sqe, if one is available. Note that s->sqe will point to memory * that is mapped by userspace. 
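Because the SQ array is mapped into userspace, every load from it in this file goes through READ_ONCE() and is validated before use; userspace can rewrite the memory at any moment. A sketch of that defensive-read pattern, with hypothetical names:

#include <linux/compiler.h>
#include <linux/types.h>

static bool fetch_sq_index(const u32 *array, u32 head, u32 mask,
			   u32 nr_entries, u32 *index)
{
	/* read the shared slot exactly once */
	u32 idx = READ_ONCE(array[head & mask]);

	if (idx >= nr_entries)
		return false;	/* drop invalid entries */
	*index = idx;
	return true;
}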
This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always @@ -1732,9 +1824,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) * though the application is the one updating it. */ head = ctx->cached_sq_head; - /* See comment at the top of this file */ - smp_rmb(); - if (head == READ_ONCE(ring->r.tail)) + /* make sure SQ entry isn't read before tail */ + if (head == smp_load_acquire(&ring->r.tail)) return false; head = READ_ONCE(ring->array[head & ctx->sq_mask]); @@ -1748,8 +1839,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) /* drop invalid entries */ ctx->cached_sq_head++; ring->dropped++; - /* See comment at the top of this file */ - smp_wmb(); return false; } @@ -1859,7 +1948,8 @@ static int io_sq_thread(void *data) /* Tell userspace we may need a wakeup call */ ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; - smp_wmb(); + /* make sure to read SQ tail after writing flags */ + smp_mb(); if (!io_get_sqring(ctx, &sqes[0])) { if (kthread_should_stop()) { @@ -1872,13 +1962,11 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; - smp_wmb(); continue; } finish_wait(&ctx->sqo_wait, &wait); ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; - smp_wmb(); } i = 0; @@ -1913,13 +2001,17 @@ static int io_sq_thread(void *data) unuse_mm(cur_mm); mmput(cur_mm); } + + if (kthread_should_park()) + kthread_parkme(); + return 0; } static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) { struct io_submit_state state, *statep = NULL; - int i, ret = 0, submit = 0; + int i, submit = 0; if (to_submit > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, ctx, to_submit); @@ -1928,6 +2020,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) for (i = 0; i < to_submit; i++) { struct sqe_submit s; + int ret; if (!io_get_sqring(ctx, &s)) break; @@ -1935,21 +2028,18 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; + submit++; ret = io_submit_sqe(ctx, &s, statep); - if (ret) { - io_drop_sqring(ctx); - break; - } - - submit++; + if (ret) + io_cqring_add_event(ctx, s.sqe->user_data, ret, 0); } io_commit_sqring(ctx); if (statep) io_submit_state_end(statep); - return submit ? 
submit : ret; + return submit; } static unsigned io_cqring_events(struct io_cq_ring *ring) @@ -1975,7 +2065,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return 0; if (sig) { - ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, + &ksigmask, &sigsaved, sigsz); + else +#endif + ret = set_user_sigmask(sig, &ksigmask, + &sigsaved, sigsz); + if (ret) return ret; } @@ -2039,6 +2137,7 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx) if (ctx->sqo_thread) { ctx->sqo_stop = 1; mb(); + kthread_park(ctx->sqo_thread); kthread_stop(ctx->sqo_thread); ctx->sqo_thread = NULL; } @@ -2200,6 +2299,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(ctx->user_files[i]); kfree(ctx->user_files); + ctx->user_files = NULL; ctx->nr_user_files = 0; return ret; } @@ -2220,19 +2320,23 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, mmgrab(current->mm); ctx->sqo_mm = current->mm; - ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); - if (!ctx->sq_thread_idle) - ctx->sq_thread_idle = HZ; + if (ctx->flags & IORING_SETUP_SQPOLL) { + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto err; - ret = -EINVAL; - if (!cpu_possible(p->sq_thread_cpu)) - goto err; + ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); + if (!ctx->sq_thread_idle) + ctx->sq_thread_idle = HZ; - if (ctx->flags & IORING_SETUP_SQPOLL) { if (p->flags & IORING_SETUP_SQ_AFF) { - int cpu; + int cpu = array_index_nospec(p->sq_thread_cpu, + nr_cpu_ids); + + ret = -EINVAL; + if (!cpu_possible(cpu)) + goto err; - cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS); ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, ctx, cpu, "io_uring-sq"); @@ -2293,8 +2397,12 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages) static void io_mem_free(void *ptr) { - struct page *page = virt_to_head_page(ptr); + struct page *page; + + if (!ptr) + return; + page = virt_to_head_page(ptr); if (put_page_testzero(page)) free_compound_page(page); } @@ -2335,7 +2443,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) if (ctx->account_mem) io_unaccount_mem(ctx->user, imu->nr_bvecs); - kfree(imu->bvec); + kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -2427,9 +2535,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, if (!pages || nr_pages > got_pages) { kfree(vmas); kfree(pages); - pages = kmalloc_array(nr_pages, sizeof(struct page *), + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - vmas = kmalloc_array(nr_pages, + vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), GFP_KERNEL); if (!pages || !vmas) { @@ -2441,7 +2549,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, got_pages = nr_pages; } - imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec), + imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); ret = -ENOMEM; if (!imu->bvec) { @@ -2480,6 +2588,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, } if (ctx->account_mem) io_unaccount_mem(ctx->user, nr_pages); + kvfree(imu->bvec); goto err; } @@ -2502,12 +2611,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ctx->nr_user_bufs++; } - kfree(pages); - kfree(vmas); + kvfree(pages); + kvfree(vmas); return 0; err: - kfree(pages); - kfree(vmas); + kvfree(pages); + kvfree(vmas); 
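The buffer-registration hunks above and below convert kmalloc_array()/kfree() pairs to kvmalloc_array()/kvfree(). A minimal sketch of the pairing rule: for a large nr_pages the allocation may silently come from vmalloc, and kvfree() is the one call that correctly releases either flavour.

#include <linux/bvec.h>
#include <linux/mm.h>

static struct bio_vec *alloc_bvecs(unsigned long nr_pages)
{
	return kvmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL);
}

static void free_bvecs(struct bio_vec *bvec)
{
	kvfree(bvec);	/* handles both kmalloc and vmalloc memory */
}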
io_sqe_buffer_unregister(ctx); return ret; } @@ -2545,9 +2654,13 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) __poll_t mask = 0; poll_wait(file, &ctx->cq_wait, wait); - /* See comment at the top of this file */ + /* + * synchronizes with barrier from wq_has_sleeper call in + * io_commit_cqring + */ smp_rmb(); - if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head) + if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head != + ctx->sq_ring->ring_entries) mask |= EPOLLOUT | EPOLLWRNORM; if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) mask |= EPOLLIN | EPOLLRDNORM; @@ -2658,24 +2771,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, mutex_lock(&ctx->uring_lock); submitted = io_ring_submit(ctx, to_submit); mutex_unlock(&ctx->uring_lock); - - if (submitted < 0) - goto out_ctx; } if (flags & IORING_ENTER_GETEVENTS) { unsigned nr_events = 0; min_complete = min(min_complete, ctx->cq_entries); - /* - * The application could have included the 'to_submit' count - * in how many events it wanted to wait for. If we failed to - * submit the desired count, we may need to adjust the number - * of events to poll/wait for. - */ - if (submitted < to_submit) - min_complete = min_t(unsigned, submitted, min_complete); - if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); ret = io_iopoll_check(ctx, &nr_events, min_complete); @@ -2721,17 +2822,12 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) { - io_mem_free(ctx->sq_ring); + if (!ctx->sq_sqes) return -ENOMEM; - } cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); - if (!cq_ring) { - io_mem_free(ctx->sq_ring); - io_mem_free(ctx->sq_sqes); + if (!cq_ring) return -ENOMEM; - } ctx->cq_ring = cq_ring; cq_ring->ring_mask = p->cq_entries - 1; @@ -2902,11 +2998,31 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) + __releases(ctx->uring_lock) + __acquires(ctx->uring_lock) { int ret; + /* + * We're inside the ring mutex, if the ref is already dying, then + * someone else killed the ctx or is already going through + * io_uring_register(). + */ + if (percpu_ref_is_dying(&ctx->refs)) + return -ENXIO; + percpu_ref_kill(&ctx->refs); + + /* + * Drop uring mutex before waiting for references to exit. If another + * thread is currently inside io_uring_enter() it might need to grab + * the uring_lock to make progress. If we hold it here across the drain + * wait, then we can deadlock. It's safe to drop the mutex here, since + * no new references will come in after we've killed the percpu ref. 
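A sketch of the kill-then-drain pattern the comment above describes, assuming a context that bundles the percpu_ref, a completion fired from the ref's release callback, and the mutex being dropped (names are illustrative):

#include <linux/completion.h>
#include <linux/mutex.h>
#include <linux/percpu-refcount.h>

struct ref_ctx {
	struct percpu_ref refs;
	struct completion done;	/* complete()d by the ref release callback */
	struct mutex lock;
};

static int quiesce_refs(struct ref_ctx *ctx)
	__releases(ctx->lock) __acquires(ctx->lock)
{
	if (percpu_ref_is_dying(&ctx->refs))
		return -ENXIO;
	percpu_ref_kill(&ctx->refs);

	/* ref holders may need the lock to drop their refs; don't pin it */
	mutex_unlock(&ctx->lock);
	wait_for_completion(&ctx->done);
	mutex_lock(&ctx->lock);
	return 0;
}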
+ */ + mutex_unlock(&ctx->uring_lock); wait_for_completion(&ctx->ctx_done); + mutex_lock(&ctx->uring_lock); switch (opcode) { case IORING_REGISTER_BUFFERS: diff --git a/fs/iomap.c b/fs/iomap.c index 97cb9d486a7d..abdd18e404f8 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1589,12 +1589,14 @@ static void iomap_dio_bio_end_io(struct bio *bio) if (should_dirty) { bio_check_pages_dirty(bio); } else { - struct bio_vec *bvec; - int i; - struct bvec_iter_all iter_all; + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + int i; - bio_for_each_segment_all(bvec, bio, i, iter_all) - put_page(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, i, iter_all) + put_page(bvec->bv_page); + } bio_put(bio); } } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 26f8d7e46462..02e0b79753e7 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -113,7 +113,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) nblocks = jbd2_space_needed(journal); while (jbd2_log_space_left(journal) < nblocks) { write_unlock(&journal->j_state_lock); - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); /* * Test again, another process may have checkpointed while we @@ -276,9 +276,22 @@ restart: "JBD2: %s: Waiting for Godot: block %llu\n", journal->j_devname, (unsigned long long) bh->b_blocknr); + if (batch_count) + __flush_batch(journal, &batch_count); jbd2_log_start_commit(journal, tid); + /* + * jbd2_journal_commit_transaction() may want + * to take the checkpoint_mutex if JBD2_FLUSHED + * is set, jbd2_update_log_tail() called by + * jbd2_journal_commit_transaction() may also take + * checkpoint_mutex. So we need to temporarily + * drop it. + */ + mutex_unlock(&journal->j_checkpoint_mutex); jbd2_log_wait_commit(journal, tid); - goto retry; + mutex_lock_io(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + goto restart; } if (!buffer_dirty(bh)) { if (unlikely(buffer_write_io_error(bh)) && !result) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 2eb55c3361a8..efd0ce9489ae 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -694,9 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) the last tag we set up. 
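The jbd2 checkpoint hunk above combines two idioms: mutex_lock_io() so the sleep is charged as I/O wait, and dropping the checkpoint mutex across jbd2_log_wait_commit() before restarting the scan. A condensed sketch with hypothetical names:

#include <linux/completion.h>
#include <linux/mutex.h>

static void wait_for_event_unlocked(struct mutex *lock,
				    struct completion *event)
{
	/* never sleep holding a mutex the waited-for thread may need */
	mutex_unlock(lock);
	wait_for_completion(event);
	mutex_lock_io(lock);	/* accounted as I/O wait */
	/* caller must recheck state: it may have changed while unlocked */
}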
*/ tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); - - jbd2_descriptor_block_csum_set(journal, descriptor); start_journal_io: + if (descriptor) + jbd2_descriptor_block_csum_set(journal, + descriptor); + for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; /* diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8ef6b6daaa7a..382c030cc78b 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -142,22 +142,6 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) return cpu_to_be32(csum); } -static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) -{ - if (!jbd2_journal_has_csum_v2or3(j)) - return 1; - - return sb->s_checksum == jbd2_superblock_csum(j, sb); -} - -static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) -{ - if (!jbd2_journal_has_csum_v2or3(j)) - return; - - sb->s_checksum = jbd2_superblock_csum(j, sb); -} - /* * Helper function used to manage commit timeouts */ @@ -1356,6 +1340,10 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } +/* + * This function expects that the caller will have locked the journal + * buffer head, and will return with it unlocked + */ static int jbd2_write_superblock(journal_t *journal, int write_flags) { struct buffer_head *bh = journal->j_sb_buffer; @@ -1365,7 +1353,6 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) trace_jbd2_write_superblock(journal, write_flags); if (!(journal->j_flags & JBD2_BARRIER)) write_flags &= ~(REQ_FUA | REQ_PREFLUSH); - lock_buffer(bh); if (buffer_write_io_error(bh)) { /* * Oh, dear. A previous attempt to write the journal @@ -1381,7 +1368,8 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); } - jbd2_superblock_csum_set(journal, sb); + if (jbd2_journal_has_csum_v2or3(journal)) + sb->s_checksum = jbd2_superblock_csum(journal, sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; ret = submit_bh(REQ_OP_WRITE, write_flags, bh); @@ -1424,6 +1412,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", tail_block, tail_tid); + lock_buffer(journal->j_sb_buffer); sb->s_sequence = cpu_to_be32(tail_tid); sb->s_start = cpu_to_be32(tail_block); @@ -1454,18 +1443,17 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) journal_superblock_t *sb = journal->j_superblock; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - read_lock(&journal->j_state_lock); - /* Is it already empty? */ - if (sb->s_start == 0) { - read_unlock(&journal->j_state_lock); + lock_buffer(journal->j_sb_buffer); + if (sb->s_start == 0) { /* Is it already empty? 
*/ + unlock_buffer(journal->j_sb_buffer); return; } + jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(0); - read_unlock(&journal->j_state_lock); jbd2_write_superblock(journal, write_op); @@ -1488,9 +1476,8 @@ void jbd2_journal_update_sb_errno(journal_t *journal) journal_superblock_t *sb = journal->j_superblock; int errcode; - read_lock(&journal->j_state_lock); + lock_buffer(journal->j_sb_buffer); errcode = journal->j_errno; - read_unlock(&journal->j_state_lock); if (errcode == -ESHUTDOWN) errcode = 0; jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); @@ -1595,17 +1582,18 @@ static int journal_get_superblock(journal_t *journal) } } - /* Check superblock checksum */ - if (!jbd2_superblock_csum_verify(journal, sb)) { - printk(KERN_ERR "JBD2: journal checksum error\n"); - err = -EFSBADCRC; - goto out; - } + if (jbd2_journal_has_csum_v2or3(journal)) { + /* Check superblock checksum */ + if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { + printk(KERN_ERR "JBD2: journal checksum error\n"); + err = -EFSBADCRC; + goto out; + } - /* Precompute checksum seed for all metadata */ - if (jbd2_journal_has_csum_v2or3(journal)) + /* Precompute checksum seed for all metadata */ journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, sizeof(sb->s_uuid)); + } set_buffer_verified(bh); @@ -1894,28 +1882,27 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb = journal->j_superblock; + /* Load the checksum driver if necessary */ + if ((journal->j_chksum_driver == NULL) && + INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { + journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); + journal->j_chksum_driver = NULL; + return 0; + } + /* Precompute checksum seed for all metadata */ + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + } + + lock_buffer(journal->j_sb_buffer); + /* If enabling v3 checksums, update superblock */ if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); - - /* Load the checksum driver */ - if (journal->j_chksum_driver == NULL) { - journal->j_chksum_driver = crypto_alloc_shash("crc32c", - 0, 0); - if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD2: Cannot load crc32c " - "driver.\n"); - journal->j_chksum_driver = NULL; - return 0; - } - - /* Precompute checksum seed for all metadata */ - journal->j_csum_seed = jbd2_chksum(journal, ~0, - sb->s_uuid, - sizeof(sb->s_uuid)); - } } /* If enabling v1 checksums, downgrade superblock */ @@ -1927,6 +1914,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb->s_feature_compat |= cpu_to_be32(compat); sb->s_feature_ro_compat |= cpu_to_be32(ro); sb->s_feature_incompat |= cpu_to_be32(incompat); + unlock_buffer(journal->j_sb_buffer); return 1; #undef COMPAT_FEATURE_ON @@ -2067,7 +2055,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) err = jbd2_journal_skip_recovery(journal); if (write) { /* Lock to make assertions happy... 
*/ - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index cc35537232f2..f940d31c2adc 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -63,7 +63,7 @@ void jbd2_journal_free_transaction(transaction_t *transaction) /* * jbd2_get_transaction: obtain a new transaction_t object. * - * Simply allocate and initialise a new transaction. Create it in + * Simply initialise a new transaction. Initialize it in * RUNNING state and add it to the current journal (which should not * have an existing running transaction: we only make a new transaction * once we have started to commit the old one). @@ -75,8 +75,8 @@ void jbd2_journal_free_transaction(transaction_t *transaction) * */ -static transaction_t * -jbd2_get_transaction(journal_t *journal, transaction_t *transaction) +static void jbd2_get_transaction(journal_t *journal, + transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; @@ -100,8 +100,6 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_max_wait = 0; transaction->t_start = jiffies; transaction->t_requested = 0; - - return transaction; } /* @@ -1252,11 +1250,12 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; char *committed_data = NULL; - JBUFFER_TRACE(jh, "entry"); if (jbd2_write_access_granted(handle, bh, true)) return 0; jh = jbd2_journal_add_journal_head(bh); + JBUFFER_TRACE(jh, "entry"); + /* * Do this first --- it can drop the journal lock, so we want to * make sure that obtaining the committed_data is done @@ -1367,15 +1366,17 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (is_handle_aborted(handle)) return -EROFS; - if (!buffer_jbd(bh)) { - ret = -EUCLEAN; - goto out; - } + if (!buffer_jbd(bh)) + return -EUCLEAN; + /* * We don't grab jh reference here since the buffer must be part * of the running transaction. */ jh = bh2jh(bh); + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + /* * This and the following assertions are unreliable since we may see jh * in inconsistent state unless we grab bh_state lock. But this is @@ -1409,9 +1410,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) } journal = transaction->t_journal; - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); - jbd_lock_bh_state(bh); if (jh->b_modified == 0) { @@ -1597,9 +1595,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_unfile_buffer(jh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - __bforget(bh); - goto drop; + goto not_jbd; } } spin_unlock(&journal->j_list_lock); @@ -1609,14 +1605,21 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) /* However, if the buffer is still owned by a prior * (committing) transaction, we can't drop it yet... */ JBUFFER_TRACE(jh, "belongs to older transaction"); - /* ... but we CAN drop it from the new transaction if we - * have also modified it since the original commit. */ + /* ... 
but we CAN drop it from the new transaction through + * marking the buffer as freed and set j_next_transaction to + * the new transaction, so that not only the commit code + * knows it should clear dirty bits when it is done with the + * buffer, but also the buffer can be checkpointed only + * after the new transaction commits. */ - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == transaction); + set_buffer_freed(bh); + + if (!jh->b_next_transaction) { spin_lock(&journal->j_list_lock); - jh->b_next_transaction = NULL; + jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); + } else { + J_ASSERT(jh->b_next_transaction == transaction); /* * only drop a reference if this transaction modified @@ -1625,9 +1628,40 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (was_modified) drop_reserve = 1; } + } else { + /* + * Finally, if the buffer is not belongs to any + * transaction, we can just drop it now if it has no + * checkpoint. + */ + spin_lock(&journal->j_list_lock); + if (!jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "belongs to none transaction"); + spin_unlock(&journal->j_list_lock); + goto not_jbd; + } + + /* + * Otherwise, if the buffer has been written to disk, + * it is safe to remove the checkpoint and drop it. + */ + if (!buffer_dirty(bh)) { + __jbd2_journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + goto not_jbd; + } + + /* + * The buffer is still not written to disk, we should + * attach this buffer to current transaction so that the + * buffer can be checkpointed only after the current + * transaction commits. + */ + clear_buffer_dirty(bh); + __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); + spin_unlock(&journal->j_list_lock); } -not_jbd: jbd_unlock_bh_state(bh); __brelse(bh); drop: @@ -1636,6 +1670,11 @@ drop: handle->h_buffer_credits++; } return err; + +not_jbd: + jbd_unlock_bh_state(bh); + __bforget(bh); + goto drop; } /** diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 389ea53ea487..bccfc40b3a74 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1414,11 +1414,6 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f) jffs2_kill_fragtree(&f->fragtree, deleted?c:NULL); - if (f->target) { - kfree(f->target); - f->target = NULL; - } - fds = f->dents; while(fds) { fd = fds; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index bb6ae387469f..05d892c79339 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -47,7 +47,10 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb) static void jffs2_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + + kfree(f->target); + kmem_cache_free(jffs2_inode_cachep, f); } static void jffs2_destroy_inode(struct inode *inode) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index dba810cd83b1..0b7d197a904c 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -17,6 +17,7 @@ #include <linux/xattr.h> #include <linux/kernfs.h> +#include <linux/fs_context.h> struct kernfs_iattrs { struct iattr ia_iattr; diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index f3ac352699cf..9a4646eecb71 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -22,16 +22,6 @@ struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; -static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char 
*data) -{ - struct kernfs_root *root = kernfs_info(sb)->root; - struct kernfs_syscall_ops *scops = root->syscall_ops; - - if (scops && scops->remount_fs) - return scops->remount_fs(root, flags, data); - return 0; -} - static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry)); @@ -60,7 +50,6 @@ const struct super_operations kernfs_sops = { .drop_inode = generic_delete_inode, .evict_inode = kernfs_evict_inode, - .remount_fs = kernfs_sop_remount_fs, .show_options = kernfs_sop_show_options, .show_path = kernfs_sop_show_path, }; @@ -222,7 +211,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, } while (true); } -static int kernfs_fill_super(struct super_block *sb, unsigned long magic) +static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc) { struct kernfs_super_info *info = kernfs_info(sb); struct inode *inode; @@ -233,7 +222,7 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; - sb->s_magic = magic; + sb->s_magic = kfc->magic; sb->s_op = &kernfs_sops; sb->s_xattr = kernfs_xattr_handlers; if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) @@ -263,21 +252,20 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) return 0; } -static int kernfs_test_super(struct super_block *sb, void *data) +static int kernfs_test_super(struct super_block *sb, struct fs_context *fc) { struct kernfs_super_info *sb_info = kernfs_info(sb); - struct kernfs_super_info *info = data; + struct kernfs_super_info *info = fc->s_fs_info; return sb_info->root == info->root && sb_info->ns == info->ns; } -static int kernfs_set_super(struct super_block *sb, void *data) +static int kernfs_set_super(struct super_block *sb, struct fs_context *fc) { - int error; - error = set_anon_super(sb, data); - if (!error) - sb->s_fs_info = data; - return error; + struct kernfs_fs_context *kfc = fc->fs_private; + + kfc->ns_tag = NULL; + return set_anon_super_fc(sb, fc); } /** @@ -294,63 +282,60 @@ const void *kernfs_super_ns(struct super_block *sb) } /** - * kernfs_mount_ns - kernfs mount helper - * @fs_type: file_system_type of the fs being mounted - * @flags: mount flags specified for the mount - * @root: kernfs_root of the hierarchy being mounted - * @magic: file system specific magic number - * @new_sb_created: tell the caller if we allocated a new superblock - * @ns: optional namespace tag of the mount + * kernfs_get_tree - kernfs filesystem access/retrieval helper + * @fc: The filesystem context. * - * This is to be called from each kernfs user's file_system_type->mount() - * implementation, which should pass through the specified @fs_type and - * @flags, and specify the hierarchy and namespace tag to mount via @root - * and @ns, respectively. - * - * The return value can be passed to the vfs layer verbatim. + * This is to be called from each kernfs user's fs_context->ops->get_tree() + * implementation, which should set the specified ->@fs_type and ->@flags, and + * specify the hierarchy and namespace tag to mount via ->@root and ->@ns, + * respectively. 
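A sketch of how a kernfs user might wire this up, assuming the kernfs_fs_context fields used in this series (root, magic, ns_tag); example_root and EXAMPLE_MAGIC are hypothetical:

#include <linux/fs_context.h>
#include <linux/kernfs.h>

#define EXAMPLE_MAGIC	0x4558414dUL	/* hypothetical */

static struct kernfs_root *example_root;

static int example_get_tree(struct fs_context *fc)
{
	struct kernfs_fs_context *kfc = fc->fs_private;

	kfc->root = example_root;
	kfc->magic = EXAMPLE_MAGIC;
	kfc->ns_tag = NULL;		/* no namespace tag */
	return kernfs_get_tree(fc);
}

static const struct fs_context_operations example_fc_ops = {
	.get_tree	= example_get_tree,
	.free		= kernfs_free_fs_context,
};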
*/ -struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, unsigned long magic, - bool *new_sb_created, const void *ns) +int kernfs_get_tree(struct fs_context *fc) { + struct kernfs_fs_context *kfc = fc->fs_private; struct super_block *sb; struct kernfs_super_info *info; int error; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) - return ERR_PTR(-ENOMEM); + return -ENOMEM; - info->root = root; - info->ns = ns; + info->root = kfc->root; + info->ns = kfc->ns_tag; INIT_LIST_HEAD(&info->node); - sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags, - &init_user_ns, info); - if (IS_ERR(sb) || sb->s_fs_info != info) - kfree(info); + fc->s_fs_info = info; + sb = sget_fc(fc, kernfs_test_super, kernfs_set_super); if (IS_ERR(sb)) - return ERR_CAST(sb); - - if (new_sb_created) - *new_sb_created = !sb->s_root; + return PTR_ERR(sb); if (!sb->s_root) { struct kernfs_super_info *info = kernfs_info(sb); - error = kernfs_fill_super(sb, magic); + kfc->new_sb_created = true; + + error = kernfs_fill_super(sb, kfc); if (error) { deactivate_locked_super(sb); - return ERR_PTR(error); + return error; } sb->s_flags |= SB_ACTIVE; mutex_lock(&kernfs_mutex); - list_add(&info->node, &root->supers); + list_add(&info->node, &info->root->supers); mutex_unlock(&kernfs_mutex); } - return dget(sb->s_root); + fc->root = dget(sb->s_root); + return 0; +} + +void kernfs_free_fs_context(struct fs_context *fc) +{ + /* Note that we don't deal with kfc->ns_tag here. */ + kfree(fc->s_fs_info); + fc->s_fs_info = NULL; } /** @@ -377,36 +362,6 @@ void kernfs_kill_sb(struct super_block *sb) kfree(info); } -/** - * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root - * @kernfs_root: the kernfs_root in question - * @ns: the namespace tag - * - * Pin the superblock so the superblock won't be destroyed in subsequent - * operations. This can be used to block ->kill_sb() which may be useful - * for kernfs users which dynamically manage superblocks. - * - * Returns NULL if there's no superblock associated to this kernfs_root, or - * -EINVAL if the superblock is being freed. - */ -struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) -{ - struct kernfs_super_info *info; - struct super_block *sb = NULL; - - mutex_lock(&kernfs_mutex); - list_for_each_entry(info, &root->supers, node) { - if (info->ns == ns) { - sb = info->sb; - if (!atomic_inc_not_zero(&info->sb->s_active)) - sb = ERR_PTR(-EINVAL); - break; - } - } - mutex_unlock(&kernfs_mutex); - return sb; -} - void __init kernfs_init(void) { diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 214a2fa1f1e3..7df6324ccb8a 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -75,17 +75,6 @@ static void nlm4_compute_offsets(const struct nlm_lock *lock, } /* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("lockd: %s prematurely hit the end of our receive buffer. 
" - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - - -/* * Encode/decode NLMv4 basic data types * * Basic NLMv4 data types are defined in Appendix II, section 6.1.4 @@ -176,7 +165,6 @@ out_size: dprintk("NFS: returned cookie was too long: %u\n", length); return -EIO; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -236,7 +224,6 @@ out_bad_xdr: __func__, be32_to_cpup(p)); return -EIO; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -309,7 +296,6 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result) out: return error; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 747b9c8c940a..4df62f635529 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -71,17 +71,6 @@ static void nlm_compute_offsets(const struct nlm_lock *lock, } /* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("lockd: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - - -/* * Encode/decode NLMv3 basic data types * * Basic NLMv3 data types are not defined in an IETF standards @@ -173,7 +162,6 @@ out_size: dprintk("NFS: returned cookie was too long: %u\n", length); return -EIO; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -231,7 +219,6 @@ out_enum: __func__, be32_to_cpup(p)); return -EIO; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -303,7 +290,6 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result) out: return error; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 93fb7cf0b92b..f0b5c987d6ae 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -290,12 +290,11 @@ void nlmclnt_release_host(struct nlm_host *host) WARN_ON_ONCE(host->h_server); - if (refcount_dec_and_test(&host->h_count)) { + if (refcount_dec_and_mutex_lock(&host->h_count, &nlm_host_mutex)) { WARN_ON_ONCE(!list_empty(&host->h_lockowners)); WARN_ON_ONCE(!list_empty(&host->h_granted)); WARN_ON_ONCE(!list_empty(&host->h_reclaim)); - mutex_lock(&nlm_host_mutex); nlm_destroy_host_locked(host); mutex_unlock(&nlm_host_mutex); } diff --git a/fs/locks.c b/fs/locks.c index eaa1cfaf73b0..71d0c6c2aac5 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1160,6 +1160,11 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, */ error = -EDEADLK; spin_lock(&blocked_lock_lock); + /* + * Ensure that we don't find any locks blocked on this + * request during deadlock detection. 
+ */ + __locks_wake_up_blocks(request); if (likely(!posix_locks_deadlock(request, fl))) { error = FILE_LOCK_DEFERRED; __locks_insert_block(fl, request, diff --git a/fs/mount.h b/fs/mount.h index f39bc9da4d73..6250de544760 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -146,3 +146,8 @@ static inline bool is_local_mountpoint(struct dentry *dentry) return __is_local_mountpoint(dentry); } + +static inline bool is_anon_ns(struct mnt_namespace *ns) +{ + return ns->seq == 0; +} diff --git a/fs/namei.c b/fs/namei.c index 0a8c5c27f90e..dede0147b3f6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2331,8 +2331,8 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path return err; } -static int filename_lookup(int dfd, struct filename *name, unsigned flags, - struct path *path, struct path *root) +int filename_lookup(int dfd, struct filename *name, unsigned flags, + struct path *path, struct path *root) { int retval; struct nameidata nd; @@ -3460,6 +3460,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } + ima_post_create_tmpfile(inode); return child; out_err: diff --git a/fs/namespace.c b/fs/namespace.c index 98a8c182af4f..c9cab307fa77 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,7 @@ #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> +#include <linux/fs_context.h> #include "pnode.h" #include "internal.h" @@ -940,38 +941,81 @@ static struct mount *skip_mnt_tree(struct mount *p) return p; } -struct vfsmount * -vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) +/** + * vfs_create_mount - Create a mount for a configured superblock + * @fc: The configuration context with the superblock attached + * + * Create a mount to an already configured superblock. If necessary, the + * caller should invoke vfs_get_tree() before calling this. + * + * Note that this does not attach the mount to anything. 
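A sketch of the sequence this comment prescribes, mirroring the fc_mount() helper added just below: configure a context, build the superblock with vfs_get_tree(), then turn the result into an as-yet-unattached vfsmount. Here fc is assumed to be fully parameterised already.

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/mount.h>

static struct vfsmount *mount_from_context(struct fs_context *fc)
{
	int err;

	err = vfs_get_tree(fc);
	if (err)
		return ERR_PTR(err);

	/* vfs_get_tree() returns with ->s_umount held for writing */
	up_write(&fc->root->d_sb->s_umount);

	/* caller still owns fc and must put_fs_context() it */
	return vfs_create_mount(fc);
}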
+ */ +struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; - struct dentry *root; - if (!type) - return ERR_PTR(-ENODEV); + if (!fc->root) + return ERR_PTR(-EINVAL); - mnt = alloc_vfsmnt(name); + mnt = alloc_vfsmnt(fc->source ?: "none"); if (!mnt) return ERR_PTR(-ENOMEM); - if (flags & SB_KERNMOUNT) + if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; - root = mount_fs(type, flags, name, data); - if (IS_ERR(root)) { - mnt_free_id(mnt); - free_vfsmnt(mnt); - return ERR_CAST(root); - } + atomic_inc(&fc->root->d_sb->s_active); + mnt->mnt.mnt_sb = fc->root->d_sb; + mnt->mnt.mnt_root = dget(fc->root); + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; - mnt->mnt.mnt_root = root; - mnt->mnt.mnt_sb = root->d_sb; - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; lock_mount_hash(); - list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); + list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); return &mnt->mnt; } +EXPORT_SYMBOL(vfs_create_mount); + +struct vfsmount *fc_mount(struct fs_context *fc) +{ + int err = vfs_get_tree(fc); + if (!err) { + up_write(&fc->root->d_sb->s_umount); + return vfs_create_mount(fc); + } + return ERR_PTR(err); +} +EXPORT_SYMBOL(fc_mount); + +struct vfsmount *vfs_kern_mount(struct file_system_type *type, + int flags, const char *name, + void *data) +{ + struct fs_context *fc; + struct vfsmount *mnt; + int ret = 0; + + if (!type) + return ERR_PTR(-EINVAL); + + fc = fs_context_for_mount(type, flags); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + if (name) + ret = vfs_parse_fs_string(fc, "source", + name, strlen(name)); + if (!ret) + ret = parse_monolithic_mount_data(fc, data); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + + put_fs_context(fc); + return mnt; +} EXPORT_SYMBOL_GPL(vfs_kern_mount); struct vfsmount * @@ -1013,27 +1057,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags = old->mnt.mnt_flags; mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); - /* Don't allow unprivileged users to change mount flags */ - if (flag & CL_UNPRIVILEGED) { - mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; - - if (mnt->mnt.mnt_flags & MNT_READONLY) - mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; - - if (mnt->mnt.mnt_flags & MNT_NODEV) - mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; - - if (mnt->mnt.mnt_flags & MNT_NOSUID) - mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; - - if (mnt->mnt.mnt_flags & MNT_NOEXEC) - mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; - } - - /* Don't allow unprivileged users to reveal what is under a mount */ - if ((flag & CL_UNPRIVILEGED) && - (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire))) - mnt->mnt.mnt_flags |= MNT_LOCKED; atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; @@ -1464,6 +1487,29 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) static void shrink_submounts(struct mount *mnt); +static int do_umount_root(struct super_block *sb) +{ + int ret = 0; + + down_write(&sb->s_umount); + if (!sb_rdonly(sb)) { + struct fs_context *fc; + + fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, + SB_RDONLY); + if (IS_ERR(fc)) { + ret = PTR_ERR(fc); + } else { + ret = parse_monolithic_mount_data(fc, NULL); + if (!ret) + ret = reconfigure_super(fc); + put_fs_context(fc); + } + } + up_write(&sb->s_umount); + return ret; +} + static int do_umount(struct mount *mnt, int flags) { struct super_block *sb = mnt->mnt.mnt_sb; @@ -1529,11 +1575,7 @@ static int do_umount(struct mount *mnt, int flags) 
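A sketch of the reconfigure sequence that do_umount_root() above and do_remount() below now share: build a reconfiguration context on the superblock's root, feed it the (possibly NULL) legacy mount data, and apply it. The caller is assumed to hold s_umount for writing.

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/fs_context.h>

static int remount_readonly(struct super_block *sb)
{
	struct fs_context *fc;
	int ret;

	fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, SB_RDONLY);
	if (IS_ERR(fc))
		return PTR_ERR(fc);

	ret = parse_monolithic_mount_data(fc, NULL);
	if (!ret)
		ret = reconfigure_super(fc);
	put_fs_context(fc);
	return ret;
}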
*/ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; - down_write(&sb->s_umount); - if (!sb_rdonly(sb)) - retval = do_remount_sb(sb, SB_RDONLY, NULL, 0); - up_write(&sb->s_umount); - return retval; + return do_umount_root(sb); } namespace_lock(); @@ -1839,6 +1881,33 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, return 0; } +static void lock_mnt_tree(struct mount *mnt) +{ + struct mount *p; + + for (p = mnt; p; p = next_mnt(p, mnt)) { + int flags = p->mnt.mnt_flags; + /* Don't allow unprivileged users to change mount flags */ + flags |= MNT_LOCK_ATIME; + + if (flags & MNT_READONLY) + flags |= MNT_LOCK_READONLY; + + if (flags & MNT_NODEV) + flags |= MNT_LOCK_NODEV; + + if (flags & MNT_NOSUID) + flags |= MNT_LOCK_NOSUID; + + if (flags & MNT_NOEXEC) + flags |= MNT_LOCK_NOEXEC; + /* Don't allow unprivileged users to reveal what is under a mount */ + if (list_empty(&p->mnt_expire)) + flags |= MNT_LOCKED; + p->mnt.mnt_flags = flags; + } +} + static void cleanup_group_ids(struct mount *mnt, struct mount *end) { struct mount *p; @@ -1956,6 +2025,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, struct mountpoint *dest_mp, struct path *parent_path) { + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; struct mountpoint *smp; @@ -2006,6 +2076,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, child->mnt_mountpoint); if (q) mnt_change_mountpoint(child, smp, q); + /* Notice when we are propagating across user namespaces */ + if (child->mnt_parent->mnt_ns->user_ns != user_ns) + lock_mnt_tree(child); commit_tree(child); } put_mountpoint(smp); @@ -2313,7 +2386,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, int err; struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); - void *sec_opts = NULL; + struct fs_context *fc; if (!check_mnt(mnt)) return -EINVAL; @@ -2324,24 +2397,22 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; - if (data && !(sb->s_type->fs_flags & FS_BINARY_MOUNTDATA)) { - err = security_sb_eat_lsm_opts(data, &sec_opts); - if (err) - return err; - } - err = security_sb_remount(sb, sec_opts); - security_free_mnt_opts(&sec_opts); - if (err) - return err; + fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK); + if (IS_ERR(fc)) + return PTR_ERR(fc); - down_write(&sb->s_umount); - err = -EPERM; - if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { - err = do_remount_sb(sb, sb_flags, data, 0); - if (!err) - set_mount_attributes(mnt, mnt_flags); + err = parse_monolithic_mount_data(fc, data); + if (!err) { + down_write(&sb->s_umount); + err = -EPERM; + if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { + err = reconfigure_super(fc); + if (!err) + set_mount_attributes(mnt, mnt_flags); + } + up_write(&sb->s_umount); } - up_write(&sb->s_umount); + put_fs_context(fc); return err; } @@ -2425,29 +2496,6 @@ out: return err; } -static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) -{ - int err; - const char *subtype = strchr(fstype, '.'); - if (subtype) { - subtype++; - err = -EINVAL; - if (!subtype[0]) - goto err; - } else - subtype = ""; - - mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); - err = -ENOMEM; - if (!mnt->mnt_sb->s_subtype) - goto err; - return mnt; - - err: - mntput(mnt); - return ERR_PTR(err); -} - /* * add a mount into a namespace's mount tree */ @@ -2492,7 
+2540,39 @@ unlock: return err; } -static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags); +static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); + +/* + * Create a new mount using a superblock configuration and request it + * be added to the namespace tree. + */ +static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, + unsigned int mnt_flags) +{ + struct vfsmount *mnt; + struct super_block *sb = fc->root->d_sb; + int error; + + error = security_sb_kern_mount(sb); + if (!error && mount_too_revealing(sb, &mnt_flags)) + error = -EPERM; + + if (unlikely(error)) { + fc_drop_locked(fc); + return error; + } + + up_write(&sb->s_umount); + + mnt = vfs_create_mount(fc); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); + if (error < 0) + mntput(mnt); + return error; +} /* * create a new mount for userspace and request it to be added into the @@ -2502,8 +2582,9 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; - struct vfsmount *mnt; - int err; + struct fs_context *fc; + const char *subtype = NULL; + int err = 0; if (!fstype) return -EINVAL; @@ -2512,23 +2593,37 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, if (!type) return -ENODEV; - mnt = vfs_kern_mount(type, sb_flags, name, data); - if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && - !mnt->mnt_sb->s_subtype) - mnt = fs_set_subtype(mnt, fstype); + if (type->fs_flags & FS_HAS_SUBTYPE) { + subtype = strchr(fstype, '.'); + if (subtype) { + subtype++; + if (!*subtype) { + put_filesystem(type); + return -EINVAL; + } + } else { + subtype = ""; + } + } + fc = fs_context_for_mount(type, sb_flags); put_filesystem(type); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - - if (mount_too_revealing(mnt, &mnt_flags)) { - mntput(mnt); - return -EPERM; - } + if (IS_ERR(fc)) + return PTR_ERR(fc); + + if (subtype) + err = vfs_parse_fs_string(fc, "subtype", + subtype, strlen(subtype)); + if (!err && name) + err = vfs_parse_fs_string(fc, "source", name, strlen(name)); + if (!err) + err = parse_monolithic_mount_data(fc, data); + if (!err) + err = vfs_get_tree(fc); + if (!err) + err = do_new_mount_fc(fc, path, mnt_flags); - err = do_add_mount(real_mount(mnt), path, mnt_flags); - if (err) - mntput(mnt); + put_fs_context(fc); return err; } @@ -2863,7 +2958,8 @@ static void dec_mnt_namespaces(struct ucounts *ucounts) static void free_mnt_ns(struct mnt_namespace *ns) { - ns_free_inum(&ns->ns); + if (!is_anon_ns(ns)) + ns_free_inum(&ns->ns); dec_mnt_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kfree(ns); @@ -2878,7 +2974,7 @@ static void free_mnt_ns(struct mnt_namespace *ns) */ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); -static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; struct ucounts *ucounts; @@ -2888,28 +2984,27 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) if (!ucounts) return ERR_PTR(-ENOSPC); - new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - dec_mnt_namespaces(ucounts); - return ERR_PTR(ret); + if (!anon) { + ret 
= ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + dec_mnt_namespaces(ucounts); + return ERR_PTR(ret); + } } new_ns->ns.ops = &mntns_operations; - new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); + if (!anon) + new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); - new_ns->root = NULL; INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); - new_ns->event = 0; new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; - new_ns->mounts = 0; - new_ns->pending_mounts = 0; return new_ns; } @@ -2933,7 +3028,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, old = ns->root; - new_ns = alloc_mnt_ns(user_ns); + new_ns = alloc_mnt_ns(user_ns, false); if (IS_ERR(new_ns)) return new_ns; @@ -2941,13 +3036,18 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; + copy_flags |= CL_SHARED_TO_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); free_mnt_ns(new_ns); return ERR_CAST(new); } + if (user_ns != ns->user_ns) { + lock_mount_hash(); + lock_mnt_tree(new); + unlock_mount_hash(); + } new_ns->root = new; list_add_tail(&new_ns->list, &new->mnt_list); @@ -2988,37 +3088,25 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, return new_ns; } -/** - * create_mnt_ns - creates a private namespace and adds a root filesystem - * @mnt: pointer to the new root filesystem mountpoint - */ -static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) -{ - struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); - if (!IS_ERR(new_ns)) { - struct mount *mnt = real_mount(m); - mnt->mnt_ns = new_ns; - new_ns->root = mnt; - new_ns->mounts++; - list_add(&mnt->mnt_list, &new_ns->list); - } else { - mntput(m); - } - return new_ns; -} - -struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) +struct dentry *mount_subtree(struct vfsmount *m, const char *name) { + struct mount *mnt = real_mount(m); struct mnt_namespace *ns; struct super_block *s; struct path path; int err; - ns = create_mnt_ns(mnt); - if (IS_ERR(ns)) + ns = alloc_mnt_ns(&init_user_ns, true); + if (IS_ERR(ns)) { + mntput(m); return ERR_CAST(ns); + } + mnt->mnt_ns = ns; + ns->root = mnt; + ns->mounts++; + list_add(&mnt->mnt_list, &ns->list); - err = vfs_path_lookup(mnt->mnt_root, mnt, + err = vfs_path_lookup(m->mnt_root, m, name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); put_mnt_ns(ns); @@ -3228,6 +3316,7 @@ out0: static void __init init_mount_tree(void) { struct vfsmount *mnt; + struct mount *m; struct mnt_namespace *ns; struct path root; struct file_system_type *type; @@ -3240,10 +3329,14 @@ static void __init init_mount_tree(void) if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = create_mnt_ns(mnt); + ns = alloc_mnt_ns(&init_user_ns, false); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); - + m = real_mount(mnt); + m->mnt_ns = ns; + ns->root = m; + ns->mounts = 1; + list_add(&m->mnt_list, &ns->list); init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); @@ -3297,10 +3390,10 @@ void put_mnt_ns(struct mnt_namespace *ns) free_mnt_ns(ns); } -struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) +struct vfsmount *kern_mount(struct file_system_type *type) { struct vfsmount *mnt; - mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data); + mnt = 
vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (!IS_ERR(mnt)) { /* * it is a longterm mount, don't release mnt until @@ -3310,7 +3403,7 @@ struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) } return mnt; } -EXPORT_SYMBOL_GPL(kern_mount_data); +EXPORT_SYMBOL_GPL(kern_mount); void kern_unmount(struct vfsmount *mnt) { @@ -3352,7 +3445,8 @@ bool current_chrooted(void) return chrooted; } -static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, +static bool mnt_already_visible(struct mnt_namespace *ns, + const struct super_block *sb, int *new_mnt_flags) { int new_flags = *new_mnt_flags; @@ -3364,7 +3458,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, struct mount *child; int mnt_flags; - if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type) + if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; /* This mount is not fully visible if it's root directory @@ -3415,7 +3509,7 @@ found: return visible; } -static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) +static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) { const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; struct mnt_namespace *ns = current->nsproxy->mnt_ns; @@ -3425,7 +3519,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) return false; /* Can this filesystem be too revealing? */ - s_iflags = mnt->mnt_sb->s_iflags; + s_iflags = sb->s_iflags; if (!(s_iflags & SB_I_USERNS_VISIBLE)) return false; @@ -3435,7 +3529,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) return true; } - return !mnt_already_visible(ns, mnt, new_mnt_flags); + return !mnt_already_visible(ns, sb, new_mnt_flags); } bool mnt_may_suid(struct vfsmount *mnt) @@ -3484,6 +3578,9 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; + if (is_anon_ns(mnt_ns)) + return -EINVAL; + if (fs->users != 1) return -EINVAL; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index a87a56273407..06233bfa6d73 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -72,16 +72,6 @@ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p) return xdr_ressize_check(rqstp, p); } -static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes) -{ - __be32 *p; - - p = xdr_inline_decode(xdr, nbytes); - if (unlikely(p == NULL)) - printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n"); - return p; -} - static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str, size_t maxlen) { @@ -98,13 +88,13 @@ static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh) { __be32 *p; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); fh->size = ntohl(*p); if (fh->size > NFS4_FHSIZE) return htonl(NFS4ERR_BADHANDLE); - p = read_buf(xdr, fh->size); + p = xdr_inline_decode(xdr, fh->size); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); memcpy(&fh->data[0], p, fh->size); @@ -117,11 +107,11 @@ static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) __be32 *p; unsigned int attrlen; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); attrlen = ntohl(*p); - p = read_buf(xdr, attrlen << 2); + p = xdr_inline_decode(xdr, attrlen << 2); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); if (likely(attrlen > 0)) @@ -135,7 
+125,7 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) { __be32 *p; - p = read_buf(xdr, NFS4_STATEID_SIZE); + p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); memcpy(stateid->data, p, NFS4_STATEID_SIZE); @@ -156,7 +146,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound status = decode_string(xdr, &hdr->taglen, &hdr->tag, CB_OP_TAGLEN_MAXSZ); if (unlikely(status != 0)) return status; - p = read_buf(xdr, 12); + p = xdr_inline_decode(xdr, 12); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); hdr->minorversion = ntohl(*p++); @@ -176,7 +166,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op) { __be32 *p; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE_HDR); *op = ntohl(*p); @@ -205,7 +195,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, status = decode_delegation_stateid(xdr, &args->stateid); if (unlikely(status != 0)) return status; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); args->truncate = ntohl(*p); @@ -227,7 +217,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, __be32 status = 0; uint32_t iomode; - p = read_buf(xdr, 4 * sizeof(uint32_t)); + p = xdr_inline_decode(xdr, 4 * sizeof(uint32_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); @@ -245,14 +235,14 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, if (unlikely(status != 0)) return status; - p = read_buf(xdr, 2 * sizeof(uint64_t)); + p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_range.offset); p = xdr_decode_hyper(p, &args->cbl_range.length); return decode_layout_stateid(xdr, &args->cbl_stateid); } else if (args->cbl_recall_type == RETURN_FSID) { - p = read_buf(xdr, 2 * sizeof(uint64_t)); + p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_fsid.major); @@ -275,7 +265,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, args->ndevs = 0; /* Num of device notifications */ - p = read_buf(xdr, sizeof(uint32_t)); + p = xdr_inline_decode(xdr, sizeof(uint32_t)); if (unlikely(p == NULL)) { status = htonl(NFS4ERR_BADXDR); goto out; @@ -298,7 +288,8 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, for (i = 0; i < n; i++) { struct cb_devicenotifyitem *dev = &args->devs[i]; - p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE); + p = xdr_inline_decode(xdr, (4 * sizeof(uint32_t)) + + NFS4_DEVICEID4_SIZE); if (unlikely(p == NULL)) { status = htonl(NFS4ERR_BADXDR); goto err; @@ -329,7 +320,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { - p = read_buf(xdr, sizeof(uint32_t)); + p = xdr_inline_decode(xdr, sizeof(uint32_t)); if (unlikely(p == NULL)) { status = htonl(NFS4ERR_BADXDR); goto err; @@ -359,7 +350,7 @@ static __be32 decode_sessionid(struct xdr_stream *xdr, { __be32 *p; - p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN); + p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); @@ -379,13 +370,13 @@ static __be32 decode_rc_list(struct xdr_stream *xdr, goto out; status 
= htonl(NFS4ERR_RESOURCE); - p = read_buf(xdr, sizeof(uint32_t)); + p = xdr_inline_decode(xdr, sizeof(uint32_t)); if (unlikely(p == NULL)) goto out; rc_list->rcl_nrefcalls = ntohl(*p++); if (rc_list->rcl_nrefcalls) { - p = read_buf(xdr, + p = xdr_inline_decode(xdr, rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); if (unlikely(p == NULL)) goto out; @@ -418,7 +409,7 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, if (status) return status; - p = read_buf(xdr, 5 * sizeof(uint32_t)); + p = xdr_inline_decode(xdr, 5 * sizeof(uint32_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); @@ -461,7 +452,7 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, uint32_t bitmap[2]; __be32 *p, status; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); args->craa_objs_to_keep = ntohl(*p++); @@ -480,7 +471,7 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp, struct cb_recallslotargs *args = argp; __be32 *p; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); args->crsa_target_highest_slotid = ntohl(*p++); @@ -492,14 +483,14 @@ static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_arg __be32 *p; unsigned int len; - p = read_buf(xdr, 12); + p = xdr_inline_decode(xdr, 12); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbnl_owner.clientid); len = be32_to_cpu(*p); - p = read_buf(xdr, len); + p = xdr_inline_decode(xdr, len); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); @@ -537,7 +528,7 @@ static __be32 decode_write_response(struct xdr_stream *xdr, __be32 *p; /* skip the always zero field */ - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out; p++; @@ -577,7 +568,7 @@ static __be32 decode_offload_args(struct svc_rqst *rqstp, return status; /* decode status */ - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out; args->error = ntohl(*p++); @@ -943,10 +934,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) }; unsigned int nops = 0; - xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); + xdr_init_decode(&xdr_in, &rqstp->rq_arg, + rqstp->rq_arg.head[0].iov_base, NULL); p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); - xdr_init_encode(&xdr_out, &rqstp->rq_res, p); + xdr_init_encode(&xdr_out, &rqstp->rq_res, p, NULL); status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); if (status == htonl(NFS4ERR_RESOURCE)) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index fb1cf1a4bda2..90d71fda65ce 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -453,7 +453,7 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, case XPRT_TRANSPORT_RDMA: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; - if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0) + if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 885363ca8569..2f6b447cdd82 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -229,6 +229,8 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation spin_lock(&delegation->lock); if (delegation->inode != NULL) inode = igrab(delegation->inode); + if (!inode) + set_bit(NFS_DELEGATION_INODE_FREEING, 
&delegation->flags); spin_unlock(&delegation->lock); return inode; } @@ -681,7 +683,7 @@ void nfs_expire_all_delegations(struct nfs_client *clp) /** * nfs_super_return_all_delegations - return delegations for one superblock - * @sb: sb to process + * @server: pointer to nfs_server to process * */ void nfs_server_return_all_delegations(struct nfs_server *server) @@ -944,10 +946,11 @@ restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry_rcu(delegation, &server->delegations, super_list) { - if (test_bit(NFS_DELEGATION_RETURNING, - &delegation->flags)) - continue; - if (test_bit(NFS_DELEGATION_NEED_RECLAIM, + if (test_bit(NFS_DELEGATION_INODE_FREEING, + &delegation->flags) || + test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags) || + test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) continue; if (!nfs_sb_active(server->super)) @@ -1053,10 +1056,11 @@ restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry_rcu(delegation, &server->delegations, super_list) { - if (test_bit(NFS_DELEGATION_RETURNING, - &delegation->flags)) - continue; - if (test_bit(NFS_DELEGATION_TEST_EXPIRED, + if (test_bit(NFS_DELEGATION_INODE_FREEING, + &delegation->flags) || + test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags) || + test_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags) == 0) continue; if (!nfs_sb_active(server->super)) diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index dcbf3394ba0e..35b4b02c1ae0 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -34,6 +34,7 @@ enum { NFS_DELEGATION_RETURNING, NFS_DELEGATION_REVOKED, NFS_DELEGATION_TEST_EXPIRED, + NFS_DELEGATION_INODE_FREEING, }; int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 6bf4471850c8..a71d0b42d160 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -139,12 +139,19 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[0]; }; +struct readdirvec { + unsigned long nr; + unsigned long index; + struct page *pages[NFS_MAX_READDIR_RAPAGES]; +}; + typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool); typedef struct { struct file *file; struct page *page; struct dir_context *ctx; unsigned long page_index; + struct readdirvec pvec; u64 *dir_cookie; u64 last_cookie; loff_t current_index; @@ -524,6 +531,10 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en struct nfs_cache_array *array; unsigned int count = 0; int status; + int max_rapages = NFS_MAX_READDIR_RAPAGES; + + desc->pvec.index = desc->page_index; + desc->pvec.nr = 0; scratch = alloc_page(GFP_KERNEL); if (scratch == NULL) return -ENOMEM; @@ -548,20 +559,40 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en if (desc->plus) nfs_prime_dcache(file_dentry(desc->file), entry); - status = nfs_readdir_add_to_array(entry, page); + status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]); + if (status == -ENOSPC) { + desc->pvec.nr++; + if (desc->pvec.nr == max_rapages) + break; + status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]); + } if (status != 0) break; } while (!entry->eof); + /* + * page and desc->pvec.pages[0] are both valid here, so there is + * no need to NULL-check either of them.
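+ * The page-cache page comes from our caller and pvec.pages[] was + * preallocated in nfs_readdir(), which is why the copy below is safe + * without further checks.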
+ */ +copy_highpage(page, desc->pvec.pages[0]); + out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - array = kmap(page); + array = kmap_atomic(desc->pvec.pages[desc->pvec.nr]); array->eof_index = array->size; status = 0; - kunmap(page); + kunmap_atomic(array); } put_page(scratch); + + /* + * desc->pvec.nr > 0 means at least one page was completely filled; + * return -ENOSPC in that case, otherwise nfs_readdir_xdr_to_array() + * would enter an infinite loop. + */ + if (desc->pvec.nr > 0) + return -ENOSPC; return status; } @@ -574,8 +605,8 @@ void nfs_readdir_free_pages(struct page **pages, unsigned int npages) } /* - * nfs_readdir_large_page will allocate pages that must be freed with a call - * to nfs_readdir_free_pagearray + * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call + * to nfs_readdir_free_pages() */ static int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages) @@ -595,6 +626,24 @@ out_freepages: return -ENOMEM; } +/* + * nfs_readdir_rapages_init() initializes the readahead pages with empty + * nfs_cache_array structures. + */ +static +void nfs_readdir_rapages_init(nfs_readdir_descriptor_t *desc) +{ + struct nfs_cache_array *array; + int max_rapages = NFS_MAX_READDIR_RAPAGES; + int index; + + for (index = 0; index < max_rapages; index++) { + array = kmap_atomic(desc->pvec.pages[index]); + memset(array, 0, sizeof(struct nfs_cache_array)); + array->eof_index = -1; + kunmap_atomic(array); + } +} + static int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) { @@ -605,6 +654,12 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, int status = -ENOMEM; unsigned int array_size = ARRAY_SIZE(pages); + /* + * Getting here means the readahead pages missed: they hold stale + * data and must be reinitialized before this fresh fill. + */ + nfs_readdir_rapages_init(desc); + entry.prev_cookie = 0; entry.cookie = desc->last_cookie; entry.eof = 0; @@ -664,9 +719,24 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) struct inode *inode = file_inode(desc->file); int ret; - ret = nfs_readdir_xdr_to_array(desc, page, inode); - if (ret < 0) - goto error; + /* + * If desc->page_index lies in the range [desc->pvec.index, + * desc->pvec.index + desc->pvec.nr), we have a readdir cache hit. + */ + if (desc->page_index >= desc->pvec.index && + desc->page_index < (desc->pvec.index + desc->pvec.nr)) { + /* + * page and desc->pvec.pages[x] are both valid here, so neither + * needs a NULL check.
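+ * For example, a fill that started at page_index 2 and completely + * filled three readahead pages leaves pvec.index == 2 and pvec.nr == 3, + * so requests for page indices 2-4 are satisfied by the copy below + * instead of another READDIR round trip.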
+ */ + copy_highpage(page, desc->pvec.pages[desc->page_index - desc->pvec.index]); + ret = 0; + } else { + ret = nfs_readdir_xdr_to_array(desc, page, inode); + if (ret < 0) + goto error; + } + SetPageUptodate(page); if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { @@ -831,6 +901,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) *desc = &my_desc; struct nfs_open_dir_context *dir_ctx = file->private_data; int res = 0; + int max_rapages = NFS_MAX_READDIR_RAPAGES; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", file, (long long)ctx->pos); @@ -850,6 +921,12 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = nfs_use_readdirplus(inode, ctx); + res = nfs_readdir_alloc_pages(desc->pvec.pages, max_rapages); + if (res < 0) + return -ENOMEM; + + nfs_readdir_rapages_init(desc); + if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) @@ -885,6 +962,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; } while (!desc->eof); out: + nfs_readdir_free_pages(desc->pvec.pages, max_rapages); if (res > 0) res = 0; dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); @@ -945,7 +1023,7 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, /** * nfs_force_lookup_revalidate - Mark the directory as having changed - * @dir - pointer to directory inode + * @dir: pointer to directory inode * * This forces the revalidation code in nfs_lookup_revalidate() to do a * full lookup on all child dentries of 'dir' whenever a change occurs @@ -1649,7 +1727,7 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, reval_dentry: if (flags & LOOKUP_RCU) return -ECHILD; - return nfs_lookup_revalidate_dentry(dir, dentry, inode);; + return nfs_lookup_revalidate_dentry(dir, dentry, inode); full_reval: return nfs_do_lookup_revalidate(dir, dentry, flags); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 33824a0a57bf..0fd811ac08b5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -428,7 +428,7 @@ out_put: hdr->release(hdr); } -static void nfs_read_sync_pgio_error(struct list_head *head) +static void nfs_read_sync_pgio_error(struct list_head *head, int error) { struct nfs_page *req; @@ -664,8 +664,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) list_for_each_entry_safe(req, tmp, &reqs, wb_list) { if (!nfs_pageio_add_request(&desc, req)) { - nfs_list_remove_request(req); - nfs_list_add_request(req, &failed); + nfs_list_move_request(req, &failed); spin_lock(&cinfo.inode->i_lock); dreq->flags = 0; if (desc.pg_error < 0) @@ -821,7 +820,7 @@ out_put: hdr->release(hdr); } -static void nfs_write_sync_pgio_error(struct list_head *head) +static void nfs_write_sync_pgio_error(struct list_head *head, int error) { struct nfs_page *req; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 29553fdba8af..4899b85f9b3c 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -89,8 +89,8 @@ EXPORT_SYMBOL_GPL(nfs_file_release); /** * nfs_revalidate_size - Revalidate the file size - * @inode - pointer to inode struct - * @file - pointer to struct file + * @inode: pointer to inode struct + * @filp: pointer to struct file * * Revalidates the file length. 
This is basically a wrapper around * nfs_revalidate_inode() that takes into account the fact that we may @@ -276,6 +276,12 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync); * then a modify/write/read cycle when writing to a page in the * page cache. * + * Some pNFS layout drivers can only read/write at a certain block + * granularity like all block devices and therefore we must perform + * read/modify/write whenever a page hasn't been read yet and the data + * to be written there is not aligned to a block boundary and/or + * smaller than the block size. + * * The modify/write/read cycle may occur if a page is read before * being completely filled by the writer. In this situation, the * page must be completely written to stable storage on the server @@ -291,26 +297,32 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync); * and that the new data won't completely replace the old data in * that range of the file. */ -static int nfs_want_read_modify_write(struct file *file, struct page *page, - loff_t pos, unsigned len) +static bool nfs_full_page_write(struct page *page, loff_t pos, unsigned int len) { unsigned int pglen = nfs_page_length(page); unsigned int offset = pos & (PAGE_SIZE - 1); unsigned int end = offset + len; - if (pnfs_ld_read_whole_page(file->f_mapping->host)) { - if (!PageUptodate(page)) - return 1; - return 0; - } + return !pglen || (end >= pglen && !offset); +} - if ((file->f_mode & FMODE_READ) && /* open for read? */ - !PageUptodate(page) && /* Uptodate? */ - !PagePrivate(page) && /* i/o request already? */ - pglen && /* valid bytes of file? */ - (end < pglen || offset)) /* replace all valid bytes? */ - return 1; - return 0; +static bool nfs_want_read_modify_write(struct file *file, struct page *page, + loff_t pos, unsigned int len) +{ + /* + * Up-to-date pages and pages with an ongoing or full-page write + * don't need read/modify/write + */ + if (PageUptodate(page) || PagePrivate(page) || + nfs_full_page_write(page, pos, len)) + return false; + + if (pnfs_ld_read_whole_page(file->f_mapping->host)) + return true; + /* Open for reading too? 
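+ * If so, read the page in now: that avoids the costlier + * modify/write/read cycle described above should the page be read later.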
*/ + if (file->f_mode & FMODE_READ) + return true; + return false; } /* diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 63abe705f4ca..6673d4ff5a2a 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -410,7 +410,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, for (i = 0; i < fls->mirror_array_cnt; i++) { struct nfs4_ff_layout_mirror *mirror; struct cred *kcred; - const struct cred *cred; + const struct cred __rcu *cred; kuid_t uid; kgid_t gid; u32 ds_count, fh_count, id; @@ -501,7 +501,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; kcred->fsuid = uid; kcred->fsgid = gid; - cred = kcred; + cred = RCU_INITIALIZER(kcred); if (lgr->range.iomode == IOMODE_READ) rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); @@ -788,30 +788,82 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, } } +static void +ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx) +{ + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + + if (devid) + nfs4_mark_deviceid_unavailable(devid); +} + +static void +ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx) +{ + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + + if (devid) + nfs4_mark_deviceid_available(devid); +} + static struct nfs4_pnfs_ds * -ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, - int *best_idx) +ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, + int start_idx, int *best_idx, + bool check_device) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); + struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; bool fail_return = false; int idx; - /* mirrors are sorted by efficiency */ + /* mirrors are initially sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { if (idx+1 == fls->mirror_array_cnt) - fail_return = true; - ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return); - if (ds) { - *best_idx = idx; - return ds; - } + fail_return = !check_device; + + mirror = FF_LAYOUT_COMP(lseg, idx); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, fail_return); + if (!ds) + continue; + + if (check_device && + nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) + continue; + + *best_idx = idx; + return ds; } return NULL; } +static struct nfs4_pnfs_ds * +ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg, + int start_idx, int *best_idx) +{ + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false); +} + +static struct nfs4_pnfs_ds * +ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg, + int start_idx, int *best_idx) +{ + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true); +} + +static struct nfs4_pnfs_ds * +ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, + int start_idx, int *best_idx) +{ + struct nfs4_pnfs_ds *ds; + + ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx); + if (ds) + return ds; + return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); +} + static void ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, @@ -925,7 +977,8 @@ retry: goto out_mds; for (i = 0; i < pgio->pg_mirror_count; i++) { - ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true); if (!ds) { if 
(!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; @@ -936,7 +989,6 @@ retry: goto retry; } pgm = &pgio->pg_mirrors[i]; - mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; } @@ -1071,6 +1123,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, break; case -NFS4ERR_RETRY_UNCACHED_REP: break; + case -EAGAIN: + return -NFS4ERR_RESET_TO_PNFS; /* Invalidate Layout errors */ case -NFS4ERR_PNFS_NO_LAYOUT: case -ESTALE: /* mapped NFS4ERR_STALE */ @@ -1131,6 +1185,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, case -EBADHANDLE: case -ELOOP: case -ENOSPC: + case -EAGAIN: break; case -EJUKEBOX: nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); @@ -1158,8 +1213,10 @@ static int ff_layout_async_handle_error(struct rpc_task *task, { int vers = clp->cl_nfs_mod->rpc_vers->number; - if (task->tk_status >= 0) + if (task->tk_status >= 0) { + ff_layout_mark_ds_reachable(lseg, idx); return 0; + } /* Handle the case of an invalid layout segment */ if (!pnfs_is_valid_lseg(lseg)) @@ -1222,6 +1279,8 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, offset, length, status, opnum, GFP_NOIO); + if (status == NFS4ERR_NXIO) + ff_layout_mark_ds_unreachable(lseg, idx); pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status); } @@ -1230,6 +1289,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + int new_idx = hdr->pgio_mirror_idx; int err; trace_nfs4_pnfs_read(hdr, task->tk_status); @@ -1248,8 +1308,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task, case -NFS4ERR_RESET_TO_PNFS: if (ff_layout_choose_best_ds_for_read(hdr->lseg, hdr->pgio_mirror_idx + 1, - &hdr->pgio_mirror_idx)) - goto out_eagain; + &new_idx)) + goto out_layouterror; set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: @@ -1260,6 +1320,10 @@ static int ff_layout_read_done_cb(struct rpc_task *task, } return 0; +out_layouterror: + ff_layout_read_record_layoutstats_done(task, hdr); + ff_layout_send_layouterror(hdr->lseg); + hdr->pgio_mirror_idx = new_idx; out_eagain: rpc_restart_call_prepare(task); return -EAGAIN; @@ -1293,15 +1357,6 @@ ff_layout_set_layoutcommit(struct inode *inode, (unsigned long long) NFS_I(inode)->layout->plh_lwb); } -static bool -ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx) -{ - /* No mirroring for now */ - struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx); - - return ff_layout_test_devid_unavailable(node); -} - static void ff_layout_read_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { @@ -1332,10 +1387,6 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, rpc_exit(task, -EIO); return -EIO; } - if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { - rpc_exit(task, -EHOSTDOWN); - return -EAGAIN; - } ff_layout_read_record_layoutstats_start(task, hdr); return 0; @@ -1369,6 +1420,16 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) ff_layout_read_prepare_common(task, hdr); } +static void +ff_layout_io_prepare_transmit(struct rpc_task *task, + void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (!pnfs_is_valid_lseg(hdr->lseg)) + rpc_exit(task, 
-EAGAIN); +} + static void ff_layout_read_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; @@ -1399,9 +1460,10 @@ static void ff_layout_read_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_read_record_layoutstats_done(&hdr->task, hdr); - if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) { + ff_layout_send_layouterror(hdr->lseg); pnfs_read_resend_pnfs(hdr); - else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) ff_layout_reset_read(hdr); pnfs_generic_rw_release(data); } @@ -1513,11 +1575,6 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EIO; } - if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { - rpc_exit(task, -EHOSTDOWN); - return -EAGAIN; - } - ff_layout_write_record_layoutstats_start(task, hdr); return 0; } @@ -1573,9 +1630,10 @@ static void ff_layout_write_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_write_record_layoutstats_done(&hdr->task, hdr); - if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) { + ff_layout_send_layouterror(hdr->lseg); ff_layout_reset_write(hdr, true); - else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) ff_layout_reset_write(hdr, false); pnfs_generic_rw_release(data); } @@ -1657,6 +1715,7 @@ static void ff_layout_commit_release(void *data) static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { .rpc_call_prepare = ff_layout_read_prepare_v3, + .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, .rpc_release = ff_layout_read_release, @@ -1664,6 +1723,7 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { .rpc_call_prepare = ff_layout_read_prepare_v4, + .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, .rpc_release = ff_layout_read_release, @@ -1671,6 +1731,7 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { .rpc_call_prepare = ff_layout_write_prepare_v3, + .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, .rpc_release = ff_layout_write_release, @@ -1678,6 +1739,7 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { .rpc_call_prepare = ff_layout_write_prepare_v4, + .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, .rpc_release = ff_layout_write_release, @@ -1703,6 +1765,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; loff_t offset = hdr->args.offset; u32 idx = hdr->pgio_mirror_idx; @@ -1713,20 +1776,21 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) __func__, hdr->inode->i_ino, hdr->args.pgbase, (size_t)hdr->args.count, offset); - ds = nfs4_ff_layout_prepare_ds(lseg, idx, false); + mirror = FF_LAYOUT_COMP(lseg, 
idx); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); if (!ds) goto out_failed; - ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, hdr->inode); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(lseg, idx); + vers = nfs4_ff_layout_ds_version(mirror); dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); @@ -1734,13 +1798,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->pgio_done_cb = ff_layout_read_done_cb; refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - fh = nfs4_ff_layout_select_ds_fh(lseg, idx); + fh = nfs4_ff_layout_select_ds_fh(mirror); if (fh) hdr->args.fh = fh; - if (vers == 4 && - !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid)) - goto out_failed; + nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); /* * Note that if we ever decide to split across DSes, @@ -1770,26 +1832,28 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; loff_t offset = hdr->args.offset; int vers; struct nfs_fh *fh; int idx = hdr->pgio_mirror_idx; - ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); + mirror = FF_LAYOUT_COMP(lseg, idx); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); if (!ds) goto out_failed; - ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, hdr->inode); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(lseg, idx); + vers = nfs4_ff_layout_ds_version(mirror); dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, @@ -1800,13 +1864,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; - fh = nfs4_ff_layout_select_ds_fh(lseg, idx); + fh = nfs4_ff_layout_select_ds_fh(mirror); if (fh) hdr->args.fh = fh; - if (vers == 4 && - !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid)) - goto out_failed; + nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); /* * Note that if we ever decide to split across DSes, @@ -1849,6 +1911,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) struct pnfs_layout_segment *lseg = data->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; u32 idx; int vers, ret; @@ -1859,20 +1922,21 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) goto out_err; idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); - ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); + mirror = FF_LAYOUT_COMP(lseg, idx); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); if (!ds) goto out_err; - ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, data->inode); if 
(IS_ERR(ds_clnt)) goto out_err; - ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred); if (!ds_cred) goto out_err; - vers = nfs4_ff_layout_ds_version(lseg, idx); + vers = nfs4_ff_layout_ds_version(mirror); dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), @@ -2036,7 +2100,7 @@ ff_layout_encode_layoutreturn(struct xdr_stream *xdr, dprintk("%s: Begin\n", __func__); - xdr_init_encode(&tmp_xdr, &tmp_buf, NULL); + xdr_init_encode(&tmp_xdr, &tmp_buf, NULL, NULL); ff_layout_encode_ioerr(&tmp_xdr, args, ff_args); ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args); @@ -2102,6 +2166,52 @@ out_nomem: return -ENOMEM; } +#ifdef CONFIG_NFS_V4_2 +void +ff_layout_send_layouterror(struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_hdr *lo = lseg->pls_layout; + struct nfs42_layout_error *errors; + LIST_HEAD(head); + + if (!nfs_server_capable(lo->plh_inode, NFS_CAP_LAYOUTERROR)) + return; + ff_layout_fetch_ds_ioerr(lo, &lseg->pls_range, &head, -1); + if (list_empty(&head)) + return; + + errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, + sizeof(*errors), GFP_NOFS); + if (errors != NULL) { + const struct nfs4_ff_layout_ds_err *pos; + size_t n = 0; + + list_for_each_entry(pos, &head, list) { + errors[n].offset = pos->offset; + errors[n].length = pos->length; + nfs4_stateid_copy(&errors[n].stateid, &pos->stateid); + errors[n].errors[0].dev_id = pos->deviceid; + errors[n].errors[0].status = pos->status; + errors[n].errors[0].opnum = pos->opnum; + n++; + if (!list_is_last(&pos->list, &head) && + n < NFS42_LAYOUTERROR_MAX) + continue; + if (nfs42_proc_layouterror(lseg, errors, n) < 0) + break; + n = 0; + } + kfree(errors); + } + ff_layout_free_ds_ioerr(&head); +} +#else +void +ff_layout_send_layouterror(struct pnfs_layout_segment *lseg) +{ +} +#endif + static int ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen) { diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index c2626bad466b..2f369966abf7 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -132,16 +132,6 @@ FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg) generic_hdr); } -static inline struct nfs4_deviceid_node * -FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) -{ - if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt || - FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL || - FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL) - return NULL; - return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node; -} - static inline struct nfs4_ff_layout_ds * FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node) { @@ -151,9 +141,25 @@ FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node) static inline struct nfs4_ff_layout_mirror * FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx) { - if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt) - return NULL; - return FF_LAYOUT_LSEG(lseg)->mirror_array[idx]; + struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); + + if (idx < fls->mirror_array_cnt) + return fls->mirror_array[idx]; + return NULL; +} + +static inline struct nfs4_deviceid_node * +FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx); + + if (mirror != NULL) { + struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds; + + if (!IS_ERR_OR_NULL(mirror_ds)) + return 
&mirror_ds->id_node; + } + return NULL; } static inline u32 @@ -174,28 +180,10 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO; } -static inline bool -ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) -{ - /* - * Flexfiles should never mark a DS unavailable, but if it does - * print a (ratelimited) warning as this can affect performance. - */ - if (nfs4_test_deviceid_unavailable(node)) { - u32 *p = (u32 *)node->deviceid.data; - - pr_warn_ratelimited("NFS: flexfiles layout referencing an " - "unavailable device [%x%x%x%x]\n", - p[0], p[1], p[2], p[3]); - return true; - } - return false; -} - static inline int -nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx) +nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror) { - return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version; + return mirror->mirror_ds->ds_versions[0].version; } struct nfs4_ff_layout_ds * @@ -207,6 +195,7 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, struct nfs4_ff_layout_mirror *mirror, u64 offset, u64 length, int status, enum nfs_opnum4 opnum, gfp_t gfp_flags); +void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg); int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head); void ff_layout_free_ds_ioerr(struct list_head *head); unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, @@ -214,23 +203,23 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, struct list_head *head, unsigned int maxnum); struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx); -int -nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg, - u32 mirror_idx, - nfs4_stateid *stateid); +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror); +void +nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, + nfs4_stateid *stateid); struct nfs4_pnfs_ds * -nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, + struct nfs4_ff_layout_mirror *mirror, bool fail_return); struct rpc_clnt * -nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, - u32 ds_idx, +nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, struct nfs_client *ds_clp, struct inode *inode); -const struct cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, - u32 ds_idx, const struct cred *mdscred); +const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, + const struct pnfs_layout_range *range, + const struct cred *mdscred); bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 11766a74216d..a809989807d6 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -183,56 +183,6 @@ out_err: return NULL; } -static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg, - struct nfs4_deviceid_node *devid) -{ - nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); - if (!ff_layout_has_available_ds(lseg)) - pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, - lseg); -} - -static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, - struct nfs4_ff_layout_mirror *mirror, - bool 
create) -{ - if (mirror == NULL || IS_ERR(mirror->mirror_ds)) - goto outerr; - if (mirror->mirror_ds == NULL) { - if (create) { - struct nfs4_deviceid_node *node; - struct pnfs_layout_hdr *lh = lseg->pls_layout; - struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); - - node = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), - &mirror->devid, lh->plh_lc_cred, - GFP_KERNEL); - if (node) - mirror_ds = FF_LAYOUT_MIRROR_DS(node); - - /* check for race with another call to this function */ - if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && - mirror_ds != ERR_PTR(-ENODEV)) - nfs4_put_deviceid_node(node); - } else - goto outerr; - } - - if (IS_ERR(mirror->mirror_ds)) - goto outerr; - - if (mirror->mirror_ds->ds == NULL) { - struct nfs4_deviceid_node *devid; - devid = &mirror->mirror_ds->id_node; - ff_layout_mark_devid_invalid(lseg, devid); - return false; - } - return true; -outerr: - pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); - return false; -} - static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, u64 offset, u64 length) { @@ -326,7 +276,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, spin_lock(&flo->generic_hdr.plh_inode->i_lock); ff_layout_add_ds_error_locked(flo, dserr); spin_unlock(&flo->generic_hdr.plh_inode->i_lock); - return 0; } @@ -353,46 +302,54 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) } struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); - struct nfs_fh *fh = NULL; - - if (!ff_layout_mirror_valid(lseg, mirror, false)) { - pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", - __func__, mirror_idx); - goto out; - } - /* FIXME: For now assume there is only 1 version available for the DS */ - fh = &mirror->fh_versions[0]; -out: - return fh; + return &mirror->fh_versions[0]; } -int -nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg, - u32 mirror_idx, - nfs4_stateid *stateid) +void +nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, + nfs4_stateid *stateid) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); + if (nfs4_ff_layout_ds_version(mirror) == 4) + nfs4_stateid_copy(stateid, &mirror->stateid); +} - if (!ff_layout_mirror_valid(lseg, mirror, false)) { - pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", - __func__, mirror_idx); - goto out; +static bool +ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo, + struct nfs4_ff_layout_mirror *mirror) +{ + if (mirror == NULL) + goto outerr; + if (mirror->mirror_ds == NULL) { + struct nfs4_deviceid_node *node; + struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); + + node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), + &mirror->devid, lo->plh_lc_cred, + GFP_KERNEL); + if (node) + mirror_ds = FF_LAYOUT_MIRROR_DS(node); + + /* check for race with another call to this function */ + if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && + mirror_ds != ERR_PTR(-ENODEV)) + nfs4_put_deviceid_node(node); } - nfs4_stateid_copy(stateid, &mirror->stateid); - return 1; -out: - return 0; + if (IS_ERR(mirror->mirror_ds)) + goto outerr; + + return true; +outerr: + return false; } /** * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call * @lseg: the layout segment we're operating on - * @ds_idx: index of the DS to use + * @mirror: layout 
mirror describing the DS to use * @fail_return: return layout on connect failure? * * Try to prepare a DS connection to accept an RPC call. This involves @@ -407,26 +364,18 @@ out: * Returns a pointer to a connected DS object on success or NULL on failure. */ struct nfs4_pnfs_ds * -nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, + struct nfs4_ff_layout_mirror *mirror, bool fail_return) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); struct nfs4_pnfs_ds *ds = NULL; - struct nfs4_deviceid_node *devid; struct inode *ino = lseg->pls_layout->plh_inode; struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; int status; - if (!ff_layout_mirror_valid(lseg, mirror, true)) { - pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", - __func__, ds_idx); - goto out; - } - - devid = &mirror->mirror_ds->id_node; - if (ff_layout_test_devid_unavailable(devid)) - goto out_fail; + if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror)) + goto noconnect; ds = mirror->mirror_ds->ds; /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ @@ -437,8 +386,8 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. */ - status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, - dataserver_retrans, + status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node, + dataserver_timeo, dataserver_retrans, mirror->mirror_ds->ds_versions[0].version, mirror->mirror_ds->ds_versions[0].minor_version); @@ -453,11 +402,12 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror->mirror_ds->ds_versions[0].wsize = max_payload; goto out; } -out_fail: +noconnect: ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); + ff_layout_send_layouterror(lseg); if (fail_return || !ff_layout_has_available_ds(lseg)) pnfs_error_mark_layout_for_return(ino, lseg); ds = NULL; @@ -466,14 +416,14 @@ out: } const struct cred * -ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, +ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, + const struct pnfs_layout_range *range, const struct cred *mdscred) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); const struct cred *cred; if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) { - cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode); + cred = ff_layout_get_mirror_cred(mirror, range->iomode); if (!cred) cred = get_cred(mdscred); } else { @@ -483,15 +433,18 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, } /** -* Find or create a DS rpc client with th MDS server rpc client auth flavor -* in the nfs_client cl_ds_clients list. -*/ + * nfs4_ff_find_or_create_ds_client - Find or create a DS rpc client + * @mirror: pointer to the mirror + * @ds_clp: nfs_client for the DS + * @inode: pointer to inode + * + * Find or create a DS rpc client with th MDS server rpc client auth flavor + * in the nfs_client cl_ds_clients list. 
+ */ struct rpc_clnt * -nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx, +nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, struct nfs_client *ds_clp, struct inode *inode) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - switch (mirror->mirror_ds->ds_versions[0].version) { case 3: /* For NFSv3 DS, flavor is set when creating DS connections */ @@ -608,7 +561,7 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) if (IS_ERR(mirror->mirror_ds)) continue; devid = &mirror->mirror_ds->id_node; - if (!ff_layout_test_devid_unavailable(devid)) + if (!nfs4_test_deviceid_unavailable(devid)) return true; } } @@ -629,7 +582,7 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) if (!mirror->mirror_ds) continue; devid = &mirror->mirror_ds->id_node; - if (ff_layout_test_devid_unavailable(devid)) + if (nfs4_test_deviceid_unavailable(devid)) return false; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 094775ea0781..414a90d48493 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -143,6 +143,7 @@ EXPORT_SYMBOL_GPL(nfs_sync_inode); /** * nfs_sync_mapping - helper to flush all mmapped dirty data to disk + * @mapping: pointer to struct address_space */ int nfs_sync_mapping(struct address_space *mapping) { @@ -1184,8 +1185,8 @@ int nfs_attribute_cache_expired(struct inode *inode) /** * nfs_revalidate_inode - Revalidate the inode attributes - * @server - pointer to nfs_server struct - * @inode - pointer to inode struct + * @server: pointer to nfs_server struct + * @inode: pointer to inode struct * * Updates inode attribute information by retrieving the data from the server. */ @@ -1255,8 +1256,8 @@ out: /** * nfs_revalidate_mapping - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping + * @inode: pointer to host inode + * @mapping: pointer to mapping */ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) @@ -1371,8 +1372,8 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) /** * nfs_check_inode_attributes - verify consistency of the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * Verifies the attribute cache. If we have just changed the attributes, * so that fattr carries weak cache consistency data, then it may @@ -1572,8 +1573,8 @@ EXPORT_SYMBOL_GPL(_nfs_display_fhandle); /** * nfs_inode_attrs_need_update - check if the inode attributes need updating - * @inode - pointer to inode - * @fattr - attributes + * @inode: pointer to inode + * @fattr: attributes * * Attempt to divine whether or not an RPC call reply carrying stale * attributes got scheduled after another call carrying updated ones. 
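All of the inode.c comment hunks above move from the old "@name - description" form to the "@name: description" form that scripts/kernel-doc actually parses. For reference, a minimal template of the layout these hunks converge on; the function and parameter names below are illustrative, not taken from the patch:

/**
 * nfs_example_helper - one-line summary of what the helper does
 * @inode: pointer to the inode being operated on
 * @fattr: updated attributes from the server
 *
 * Optional longer description: behaviour, locking rules, and the
 * context in which the helper may be called.
 *
 * Return: zero on success, or a negative errno.
 */
static int nfs_example_helper(struct inode *inode, struct nfs_fattr *fattr);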
@@ -1614,8 +1615,8 @@ static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr /** * nfs_refresh_inode - try to update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * Check that an RPC call that returned attributes has not overlapped with * other recent updates of the inode metadata, then decide whether it is @@ -1649,8 +1650,8 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, /** * nfs_post_op_update_inode - try to update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. @@ -1679,8 +1680,8 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); /** * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. Fake up @@ -1731,8 +1732,8 @@ out_noforce: /** * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. Fake up diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index b1e577302518..c7cf23ae6597 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -69,7 +69,8 @@ struct nfs_clone_mount { * Maximum number of pages that readdir can use for creating * a vmapped array of pages. */ -#define NFS_MAX_READDIR_PAGES 8 +#define NFS_MAX_READDIR_PAGES 64 +#define NFS_MAX_READDIR_RAPAGES 8 struct nfs_client_initdata { unsigned long init_flags; @@ -755,6 +756,7 @@ static inline bool nfs_error_is_fatal(int err) { switch (err) { case -ERESTARTSYS: + case -EINTR: case -EACCES: case -EDQUOT: case -EFBIG: @@ -763,6 +765,7 @@ static inline bool nfs_error_is_fatal(int err) case -EROFS: case -ESTALE: case -E2BIG: + case -ENOMEM: return true; default: return false; diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 9034b4926909..5088fda9b453 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -25,7 +25,7 @@ static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) /** * nfs_start_io_read - declare the file is being used for buffered reads - * @inode - file inode + * @inode: file inode * * Declare that a buffered read operation is about to start, and ensure * that we block all direct I/O. @@ -56,7 +56,7 @@ nfs_start_io_read(struct inode *inode) /** * nfs_end_io_read - declare that the buffered read operation is done - * @inode - file inode + * @inode: file inode * * Declare that a buffered read operation is done, and release the shared * lock on inode->i_rwsem. @@ -69,7 +69,7 @@ nfs_end_io_read(struct inode *inode) /** * nfs_start_io_write - declare the file is being used for buffered writes - * @inode - file inode + * @inode: file inode * * Declare that a buffered read operation is about to start, and ensure * that we block all direct I/O. 
@@ -83,7 +83,7 @@ nfs_start_io_write(struct inode *inode) /** * nfs_end_io_write - declare that the buffered write operation is done - * @inode - file inode + * @inode: file inode * * Declare that a buffered write operation is done, and release the * lock on inode->i_rwsem. @@ -105,7 +105,7 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) /** * nfs_end_io_direct - declare the file is being used for direct i/o - * @inode - file inode + * @inode: file inode * * Declare that a direct I/O operation is about to start, and ensure * that we block all buffered I/O. @@ -136,7 +136,7 @@ nfs_start_io_direct(struct inode *inode) /** * nfs_end_io_direct - declare that the direct i/o operation is done - * @inode - file inode + * @inode: file inode * * Declare that a direct I/O operation is done, and release the shared * lock on inode->i_rwsem. diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index e5686be67be8..15f099a24c29 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -221,10 +221,10 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, /** * nfs_do_submount - set up mountpoint when crossing a filesystem boundary - * @dentry - parent directory - * @fh - filehandle for new root dentry - * @fattr - attributes for new root inode - * @authflavor - security flavor to use when performing the mount + * @dentry: parent directory + * @fh: filehandle for new root dentry + * @fattr: attributes for new root inode + * @authflavor: security flavor to use when performing the mount * */ struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 350675e3ed47..a7ed29de0a40 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -22,6 +22,7 @@ #include <linux/nfs.h> #include <linux/nfs2.h> #include <linux/nfs_fs.h> +#include "nfstrace.h" #include "internal.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -55,42 +56,16 @@ #define NFS_attrstat_sz (1+NFS_fattr_sz) #define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz) -#define NFS_readlinkres_sz (2) -#define NFS_readres_sz (1+NFS_fattr_sz+1) +#define NFS_readlinkres_sz (2+1) +#define NFS_readres_sz (1+NFS_fattr_sz+1+1) #define NFS_writeres_sz (NFS_attrstat_sz) #define NFS_stat_sz (1) -#define NFS_readdirres_sz (1) +#define NFS_readdirres_sz (1+1) #define NFS_statfsres_sz (1+NFS_info_sz) static int nfs_stat_to_errno(enum nfs_stat); /* - * While encoding arguments, set up the reply buffer in advance to - * receive reply data directly into the page cache. - */ -static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, - unsigned int base, unsigned int len, - unsigned int bufsize) -{ - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - - replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); -} - -/* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("NFS: %s prematurely hit the end of our receive buffer. 
" - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - - -/* * Encode/decode NFSv2 basic data types * * Basic NFSv2 data types are defined in section 2.3 of RFC 1094: @@ -110,8 +85,8 @@ static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); recvd = xdr_read_pages(xdr, count); if (unlikely(count > recvd)) @@ -125,9 +100,6 @@ out_cheating: "count %u > recvd %u\n", count, recvd); count = recvd; goto out; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -157,13 +129,16 @@ static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; + if (unlikely(*p != cpu_to_be32(NFS_OK))) + goto out_status; + *status = 0; + return 0; +out_status: *status = be32_to_cpup(p); + trace_nfs_xdr_status((int)*status); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -205,14 +180,11 @@ static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) __be32 *p; p = xdr_inline_decode(xdr, NFS2_FHSIZE); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fh->size = NFS2_FHSIZE; memcpy(fh->data, p, NFS2_FHSIZE); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -282,8 +254,8 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, NFS_fattr_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fattr->valid |= NFS_ATTR_FATTR_V2; @@ -325,9 +297,6 @@ out_uid: out_gid: dprintk("NFS: returned invalid gid\n"); return -EINVAL; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -416,23 +385,20 @@ static int decode_filename_inline(struct xdr_stream *xdr, u32 count; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); if (count > NFS3_MAXNAMLEN) goto out_nametoolong; p = xdr_inline_decode(xdr, count); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; *name = (const char *)p; *length = count; return 0; out_nametoolong: dprintk("NFS: returned filename too long: %u\n", count); return -ENAMETOOLONG; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -455,8 +421,8 @@ static int decode_path(struct xdr_stream *xdr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; length = be32_to_cpup(p); if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN)) goto out_size; @@ -472,9 +438,6 @@ out_cheating: dprintk("NFS: server cheating in pathname result: " "length %u > received %u\n", length, recvd); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -615,8 +578,8 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, const struct nfs_readlinkargs *args = data; encode_fhandle(xdr, args->fh); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->pglen, NFS_readlinkres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->pglen, NFS_readlinkres_sz); } /* @@ -651,8 +614,8 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, const struct nfs_pgio_args *args = data; 
encode_readargs(xdr, args); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->count, NFS_readres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, NFS_readres_sz); req->rq_rcv_buf.flags |= XDRBUF_READ; } @@ -809,8 +772,8 @@ static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, const struct nfs_readdirargs *args = data; encode_readdirargs(xdr, args); - prepare_reply_buffer(req, args->pages, 0, - args->count, NFS_readdirres_sz); + rpc_prepare_reply_pages(req, args->pages, 0, + args->count, NFS_readdirres_sz); } /* @@ -951,12 +914,12 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int error; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p++ == xdr_zero) { p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p++ == xdr_zero) return -EAGAIN; entry->eof = 1; @@ -964,8 +927,8 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, } p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; entry->ino = be32_to_cpup(p); error = decode_filename_inline(xdr, &entry->name, &entry->len); @@ -978,17 +941,13 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, */ entry->prev_cookie = entry->cookie; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; entry->cookie = be32_to_cpup(p); entry->d_type = DT_UNKNOWN; return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EAGAIN; } /* @@ -1052,17 +1011,14 @@ static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result) __be32 *p; p = xdr_inline_decode(xdr, NFS_info_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->tsize = be32_to_cpup(p++); result->bsize = be32_to_cpup(p++); result->blocks = be32_to_cpup(p++); result->bfree = be32_to_cpup(p++); result->bavail = be32_to_cpup(p); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr, diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9fce18548f7e..c5c3fc6e6c60 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -222,8 +222,6 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, switch (status) { case 0: status = nfs_refresh_inode(inode, fattr); - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 78df4eb60f85..110358f4986d 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -21,6 +21,7 @@ #include <linux/nfs3.h> #include <linux/nfs_fs.h> #include <linux/nfsacl.h> +#include "nfstrace.h" #include "internal.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -68,13 +69,13 @@ #define NFS3_removeres_sz (NFS3_setattrres_sz) #define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) #define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) -#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1) -#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3) +#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+1) +#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+1) #define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4) #define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) #define 
NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz)) #define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) -#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2) +#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+1) #define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13) #define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12) #define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6) @@ -84,7 +85,7 @@ #define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) #define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ - XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+1) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) static int nfs3_stat_to_errno(enum nfs_stat); @@ -104,32 +105,6 @@ static const umode_t nfs_type2fmt[] = { }; /* - * While encoding arguments, set up the reply buffer in advance to - * receive reply data directly into the page cache. - */ -static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, - unsigned int base, unsigned int len, - unsigned int bufsize) -{ - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - - replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); -} - -/* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("NFS: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - - -/* * Encode/decode NFSv3 basic data types * * Basic NFSv3 data types are defined in section 2.5 of RFC 1813: @@ -151,13 +126,10 @@ static int decode_uint32(struct xdr_stream *xdr, u32 *value) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; *value = be32_to_cpup(p); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_uint64(struct xdr_stream *xdr, u64 *value) @@ -165,13 +137,10 @@ static int decode_uint64(struct xdr_stream *xdr, u64 *value) __be32 *p; p = xdr_inline_decode(xdr, 8); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; xdr_decode_hyper(p, value); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -211,14 +180,14 @@ static int decode_inline_filename3(struct xdr_stream *xdr, u32 count; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); if (count > NFS3_MAXNAMLEN) goto out_nametoolong; p = xdr_inline_decode(xdr, count); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; *name = (const char *)p; *length = count; return 0; @@ -226,9 +195,6 @@ static int decode_inline_filename3(struct xdr_stream *xdr, out_nametoolong: dprintk("NFS: returned filename too long: %u\n", count); return -ENAMETOOLONG; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -249,8 +215,8 @@ static int decode_nfspath3(struct xdr_stream *xdr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN)) goto out_nametoolong; @@ -267,9 +233,6 @@ out_cheating: dprintk("NFS: server cheating in pathname result: " "count %u > recvd %u\n", count, recvd); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - 
return -EIO; } /* @@ -303,13 +266,10 @@ static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier) __be32 *p; p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; memcpy(verifier, p, NFS3_COOKIEVERFSIZE); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -330,13 +290,10 @@ static int decode_writeverf3(struct xdr_stream *xdr, struct nfs_write_verifier * __be32 *p; p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; memcpy(verifier->data, p, NFS3_WRITEVERFSIZE); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -364,13 +321,16 @@ static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; + if (unlikely(*p != cpu_to_be32(NFS3_OK))) + goto out_status; + *status = 0; + return 0; +out_status: *status = be32_to_cpup(p); + trace_nfs_xdr_status((int)*status); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -453,23 +413,20 @@ static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; length = be32_to_cpup(p++); if (unlikely(length > NFS3_FHSIZE)) goto out_toobig; p = xdr_inline_decode(xdr, length); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fh->size = length; memcpy(fh->data, p, length); return 0; out_toobig: dprintk("NFS: file handle size (%u) too big\n", length); return -E2BIG; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static void zero_nfs_fh3(struct nfs_fh *fh) @@ -655,8 +612,8 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; p = xdr_decode_ftype3(p, &fmode); @@ -690,9 +647,6 @@ out_uid: out_gid: dprintk("NFS: returned invalid gid\n"); return -EINVAL; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -710,14 +664,11 @@ static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; if (*p != xdr_zero) return decode_fattr3(xdr, fattr); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -733,8 +684,8 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fattr->valid |= NFS_ATTR_FATTR_PRESIZE | NFS_ATTR_FATTR_PRECHANGE @@ -747,9 +698,6 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -773,14 +721,11 @@ static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; if (*p != xdr_zero) return decode_wcc_attr(xdr, fattr); return 0; 
-out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr) @@ -808,15 +753,12 @@ out: static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) { __be32 *p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; if (*p != xdr_zero) return decode_nfs_fh3(xdr, fh); zero_nfs_fh3(fh); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -953,8 +895,8 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, const struct nfs3_readlinkargs *args = data; encode_nfs_fh3(xdr, args->fh); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->pglen, NFS3_readlinkres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->pglen, NFS3_readlinkres_sz); } /* @@ -986,8 +928,8 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, unsigned int replen = args->replen ? args->replen : NFS3_readres_sz; encode_read3args(xdr, args); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->count, replen); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, replen); req->rq_rcv_buf.flags |= XDRBUF_READ; } @@ -1279,7 +1221,7 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, const struct nfs3_readdirargs *args = data; encode_readdir3args(xdr, args); - prepare_reply_buffer(req, args->pages, 0, + rpc_prepare_reply_pages(req, args->pages, 0, args->count, NFS3_readdirres_sz); } @@ -1321,7 +1263,7 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, const struct nfs3_readdirargs *args = data; encode_readdirplus3args(xdr, args); - prepare_reply_buffer(req, args->pages, 0, + rpc_prepare_reply_pages(req, args->pages, 0, args->count, NFS3_readdirres_sz); } @@ -1366,7 +1308,7 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, encode_nfs_fh3(xdr, args->fh); encode_uint32(xdr, args->mask); if (args->mask & (NFS_ACL | NFS_DFACL)) { - prepare_reply_buffer(req, args->pages, 0, + rpc_prepare_reply_pages(req, args->pages, 0, NFSACL_MAXPAGES << PAGE_SHIFT, ACL3_getaclres_sz); req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; @@ -1643,8 +1585,8 @@ static int decode_read3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 + 4 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p++); eof = be32_to_cpup(p++); ocount = be32_to_cpup(p++); @@ -1667,9 +1609,6 @@ out_cheating: count = recvd; eof = 0; goto out; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -1690,7 +1629,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, result->op_status = status; if (status != NFS3_OK) goto out_status; - result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2); + result->replen = 4 + ((xdr_stream_pos(xdr) - pos) >> 2); error = decode_read3resok(xdr, result); out: return error; @@ -1731,22 +1670,18 @@ static int decode_write3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->count = be32_to_cpup(p++); result->verf->committed = be32_to_cpup(p++); if (unlikely(result->verf->committed > NFS_FILE_SYNC)) goto out_badvalue; if (decode_writeverf3(xdr, &result->verf->verifier)) - goto out_eio; + return -EIO; return result->count; out_badvalue: dprintk("NFS: bad 
stable_how value: %u\n", result->verf->committed); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); -out_eio: - return -EIO; } static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -2010,12 +1945,12 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, u64 new_cookie; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p == xdr_zero) { p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p == xdr_zero) return -EAGAIN; entry->eof = 1; @@ -2051,8 +1986,8 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, /* In fact, a post_op_fh3: */ p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p != xdr_zero) { error = decode_nfs_fh3(xdr, entry->fh); if (unlikely(error)) { @@ -2069,9 +2004,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EAGAIN; out_truncated: dprintk("NFS: directory entry contains invalid file handle\n"); *entry = old; @@ -2183,8 +2115,8 @@ static int decode_fsstat3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 8 * 6 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; p = xdr_decode_size3(p, &result->tbytes); p = xdr_decode_size3(p, &result->fbytes); p = xdr_decode_size3(p, &result->abytes); @@ -2193,9 +2125,6 @@ static int decode_fsstat3resok(struct xdr_stream *xdr, xdr_decode_size3(p, &result->afiles); /* ignore invarsec */ return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, @@ -2255,8 +2184,8 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->rtmax = be32_to_cpup(p++); result->rtpref = be32_to_cpup(p++); result->rtmult = be32_to_cpup(p++); @@ -2270,9 +2199,6 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr, /* ignore properties */ result->lease_time = 0; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, @@ -2328,15 +2254,12 @@ static int decode_pathconf3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 * 6); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->max_link = be32_to_cpup(p++); result->max_namelen = be32_to_cpup(p); /* ignore remaining fields */ return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 19ec38f85ce0..901cca7542f9 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -20,5 +20,8 @@ loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, struct nfs42_layoutstat_data *); int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t); +int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg, + const struct nfs42_layout_error *errors, + size_t n); #endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index fed06fd9998d..5196bfa7894d 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ 
-329,9 +329,6 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, }; ssize_t err, err2; - if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) - return -EOPNOTSUPP; - src_lock = nfs_get_lock_context(nfs_file_open_context(src)); if (IS_ERR(src_lock)) return PTR_ERR(src_lock); @@ -672,6 +669,170 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, return 0; } +static struct nfs42_layouterror_data * +nfs42_alloc_layouterror_data(struct pnfs_layout_segment *lseg, gfp_t gfp_flags) +{ + struct nfs42_layouterror_data *data; + struct inode *inode = lseg->pls_layout->plh_inode; + + data = kzalloc(sizeof(*data), gfp_flags); + if (data) { + data->args.inode = data->inode = nfs_igrab_and_active(inode); + if (data->inode) { + data->lseg = pnfs_get_lseg(lseg); + if (data->lseg) + return data; + nfs_iput_and_deactive(data->inode); + } + kfree(data); + } + return NULL; +} + +static void +nfs42_free_layouterror_data(struct nfs42_layouterror_data *data) +{ + pnfs_put_lseg(data->lseg); + nfs_iput_and_deactive(data->inode); + kfree(data); +} + +static void +nfs42_layouterror_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs42_layouterror_data *data = calldata; + struct inode *inode = data->inode; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layout_hdr *lo = data->lseg->pls_layout; + unsigned i; + + spin_lock(&inode->i_lock); + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + rpc_exit(task, 0); + return; + } + for (i = 0; i < data->args.num_errors; i++) + nfs4_stateid_copy(&data->args.errors[i].stateid, + &lo->plh_stateid); + spin_unlock(&inode->i_lock); + nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, + &data->res.seq_res, task); +} + +static void +nfs42_layouterror_done(struct rpc_task *task, void *calldata) +{ + struct nfs42_layouterror_data *data = calldata; + struct inode *inode = data->inode; + struct pnfs_layout_hdr *lo = data->lseg->pls_layout; + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { + case 0: + break; + case -NFS4ERR_BADHANDLE: + case -ESTALE: + pnfs_destroy_layout(NFS_I(inode)); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + spin_lock(&inode->i_lock); + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match(&data->args.errors[0].stateid, + &lo->plh_stateid)) { + LIST_HEAD(head); + + /* + * Mark the bad layout state as invalid, then retry + * with the current stateid. + */ + pnfs_mark_layout_stateid_invalid(lo, &head); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + nfs_commit_inode(inode, 0); + } else + spin_unlock(&inode->i_lock); + break; + case -NFS4ERR_OLD_STATEID: + spin_lock(&inode->i_lock); + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(&data->args.errors[0].stateid, + &lo->plh_stateid)) { + /* Do we need to delay before resending? 
*/ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, + &data->args.errors[0].stateid)) + rpc_delay(task, HZ); + rpc_restart_call_prepare(task); + } + spin_unlock(&inode->i_lock); + break; + case -ENOTSUPP: + case -EOPNOTSUPP: + NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTERROR; + } +} + +static void +nfs42_layouterror_release(void *calldata) +{ + struct nfs42_layouterror_data *data = calldata; + + nfs42_free_layouterror_data(data); +} + +static const struct rpc_call_ops nfs42_layouterror_ops = { + .rpc_call_prepare = nfs42_layouterror_prepare, + .rpc_call_done = nfs42_layouterror_done, + .rpc_release = nfs42_layouterror_release, +}; + +int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg, + const struct nfs42_layout_error *errors, size_t n) +{ + struct inode *inode = lseg->pls_layout->plh_inode; + struct nfs42_layouterror_data *data; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTERROR], + }; + struct rpc_task_setup task_setup = { + .rpc_message = &msg, + .callback_ops = &nfs42_layouterror_ops, + .flags = RPC_TASK_ASYNC, + }; + unsigned int i; + + if (!nfs_server_capable(inode, NFS_CAP_LAYOUTERROR)) + return -EOPNOTSUPP; + if (n > NFS42_LAYOUTERROR_MAX) + return -EINVAL; + data = nfs42_alloc_layouterror_data(lseg, GFP_NOFS); + if (!data) + return -ENOMEM; + for (i = 0; i < n; i++) { + data->args.errors[i] = errors[i]; + data->args.num_errors++; + data->res.num_errors++; + } + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + task_setup.callback_data = data; + task_setup.rpc_client = NFS_SERVER(inode)->client; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0); + task = rpc_run_task(&task_setup); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(nfs42_proc_layouterror); + static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct file *dst_f, struct nfs_lock_context *src_lock, struct nfs_lock_context *dst_lock, loff_t src_offset, diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 69f72ed2bf87..aed865a84629 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -51,6 +51,15 @@ 1 /* opaque devaddr4 length */ + \ XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE)) #define decode_layoutstats_maxsz (op_decode_hdr_maxsz) +#define encode_device_error_maxsz (XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \ + 1 /* status */ + 1 /* opnum */) +#define encode_layouterror_maxsz (op_decode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + encode_stateid_maxsz + \ + 1 /* Array size */ + \ + encode_device_error_maxsz) +#define decode_layouterror_maxsz (op_decode_hdr_maxsz) #define encode_clone_maxsz (encode_stateid_maxsz + \ encode_stateid_maxsz + \ 2 /* src offset */ + \ @@ -59,43 +68,53 @@ #define decode_clone_maxsz (op_decode_hdr_maxsz) #define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_allocate_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_allocate_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ encode_copy_maxsz + \ encode_commit_maxsz) #define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ decode_copy_maxsz + \ decode_commit_maxsz) #define 
NFS4_enc_offload_cancel_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_offload_cancel_maxsz) #define NFS4_dec_offload_cancel_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_offload_cancel_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_deallocate_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_seek_maxsz) #define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_seek_maxsz) #define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \ @@ -106,6 +125,16 @@ decode_sequence_maxsz + \ decode_putfh_maxsz + \ PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz) +#define NFS4_enc_layouterror_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + NFS42_LAYOUTERROR_MAX * \ + encode_layouterror_maxsz) +#define NFS4_dec_layouterror_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + NFS42_LAYOUTERROR_MAX * \ + decode_layouterror_maxsz) #define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -223,6 +252,34 @@ static void encode_clone(struct xdr_stream *xdr, xdr_encode_hyper(p, args->count); } +static void encode_device_error(struct xdr_stream *xdr, + const struct nfs42_device_error *error) +{ + __be32 *p; + + p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 2*4); + p = xdr_encode_opaque_fixed(p, error->dev_id.data, + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(error->status); + *p = cpu_to_be32(error->opnum); +} + +static void encode_layouterror(struct xdr_stream *xdr, + const struct nfs42_layout_error *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LAYOUTERROR, decode_layouterror_maxsz, hdr); + p = reserve_space(xdr, 8 + 8); + p = xdr_encode_hyper(p, args->offset); + p = xdr_encode_hyper(p, args->length); + encode_nfs4_stateid(xdr, &args->stateid); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(1); + encode_device_error(xdr, &args->errors[0]); +} + /* * Encode ALLOCATE request */ @@ -381,6 +438,27 @@ static void nfs4_xdr_enc_clone(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode LAYOUTERROR request + */ +static void nfs4_xdr_enc_layouterror(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_layouterror_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + int i; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + for (i = 0; i < args->num_errors; i++) + encode_layouterror(xdr, &args->errors[i], &hdr); + encode_nops(&hdr); +} + static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_ALLOCATE); @@ -394,7 +472,7 @@ static int decode_write_response(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; count = be32_to_cpup(p); if (count > 1) return -EREMOTEIO; @@ -402,18 +480,14 @@ static int decode_write_response(struct xdr_stream *xdr, status = decode_opaque_fixed(xdr, 
&res->stateid, NFS4_STATEID_SIZE); if (unlikely(status)) - goto out_overflow; + return -EIO; } p = xdr_inline_decode(xdr, 8 + 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->count); res->verifier.committed = be32_to_cpup(p); return decode_verifier(xdr, &res->verifier.verifier); - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_copy_requirements(struct xdr_stream *xdr, @@ -422,14 +496,11 @@ static int decode_copy_requirements(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4 + 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->consecutive = be32_to_cpup(p++); res->synchronous = be32_to_cpup(p++); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res) @@ -474,15 +545,11 @@ static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res) p = xdr_inline_decode(xdr, 4 + 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->sr_eof = be32_to_cpup(p++); p = xdr_decode_hyper(p, &res->sr_offset); return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_layoutstats(struct xdr_stream *xdr) @@ -495,6 +562,11 @@ static int decode_clone(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_CLONE); } +static int decode_layouterror(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_LAYOUTERROR); +} + /* * Decode ALLOCATE request */ @@ -704,4 +776,30 @@ out: return status; } +/* + * Decode LAYOUTERROR request + */ +static int nfs4_xdr_dec_layouterror(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_layouterror_res *res = data; + struct compound_hdr hdr; + int status, i; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + + for (i = 0; i < res->num_errors && status == 0; i++) + status = decode_layouterror(xdr); +out: + res->rpc_status = status; + return status; +} + #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */ diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 2548405da1f7..1339ede979af 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -42,7 +42,7 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) } #ifdef CONFIG_NFS_V4_1 -/** +/* * Per auth flavor data server rpc clients */ struct nfs4_ds_server { @@ -51,7 +51,9 @@ struct nfs4_ds_server { }; /** - * Common lookup case for DS I/O + * nfs4_find_ds_client - Common lookup case for DS I/O + * @ds_clp: pointer to the DS's nfs_client + * @flavor: rpc auth flavour to match */ static struct nfs4_ds_server * nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor) @@ -118,9 +120,13 @@ nfs4_free_ds_server(struct nfs4_ds_server *dss) } /** -* Find or create a DS rpc client with th MDS server rpc client auth flavor -* in the nfs_client cl_ds_clients list. -*/ + * nfs4_find_or_create_ds_client - Find or create a DS rpc client + * @ds_clp: pointer to the DS's nfs_client + * @inode: pointer to the inode + * + * Find or create a DS rpc client with th MDS server rpc client auth flavor + * in the nfs_client cl_ds_clients list. 
+ */ struct rpc_clnt * nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode) { @@ -145,7 +151,6 @@ static void nfs4_shutdown_ds_clients(struct nfs_client *clp) { struct nfs4_ds_server *dss; - LIST_HEAD(shutdown_list); while (!list_empty(&clp->cl_ds_clients)) { dss = list_entry(clp->cl_ds_clients.next, @@ -284,7 +289,7 @@ static int nfs4_init_callback(struct nfs_client *clp) /** * nfs40_init_client - nfs_client initialization tasks for NFSv4.0 - * @clp - nfs_client to initialize + * @clp: nfs_client to initialize * * Returns zero on success, or a negative errno if some error occurred. */ @@ -312,7 +317,7 @@ int nfs40_init_client(struct nfs_client *clp) /** * nfs41_init_client - nfs_client initialization tasks for NFSv4.1+ - * @clp - nfs_client to initialize + * @clp: nfs_client to initialize * * Returns zero on success, or a negative errno if some error occurred. */ @@ -360,9 +365,7 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) * nfs4_init_client - Initialise an NFS4 client record * * @clp: nfs_client to initialise - * @timeparms: timeout parameters for underlying RPC transport - * @ip_addr: callback IP address in presentation format - * @authflavor: authentication flavor for underlying RPC transport + * @cl_init: pointer to nfs_client_initdata * * Returns pointer to an NFS client, or an ERR_PTR value. */ @@ -649,13 +652,13 @@ nfs4_check_server_scope(struct nfs41_server_scope *s1, /** * nfs4_detect_session_trunking - Checks for session trunking. - * - * Called after a successful EXCHANGE_ID on a multi-addr connection. - * Upon success, add the transport. - * * @clp: original mount nfs_client * @res: result structure from an exchange_id using the original mount * nfs_client with a new multi_addr transport + * @xprt: pointer to the transport to add. + * + * Called after a successful EXCHANGE_ID on a multi-addr connection. + * Upon success, add the transport. 
* * Returns zero on success, otherwise -EINVAL * diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 45b2322e092d..00d17198ee12 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -133,8 +133,10 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { + if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY)) + return -EOPNOTSUPP; if (file_inode(file_in) == file_inode(file_out)) - return -EINVAL; + return -EOPNOTSUPP; return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); } diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 24f06dcc2b08..2e460c33ae48 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -137,6 +137,7 @@ static size_t nfs_parse_server_name(char *string, size_t len, /** * nfs_find_best_sec - Find a security mechanism supported locally + * @clnt: pointer to rpc_clnt * @server: NFS server struct * @flavors: List of security tuples returned by SECINFO procedure * @@ -288,8 +289,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, /** * nfs_follow_referral - set up mountpoint when hitting a referral on moved error - * @dentry - parent directory - * @locations - array of NFSv4 server location information + * @dentry: parent directory + * @locations: array of NFSv4 server location information * */ static struct vfsmount *nfs_follow_referral(struct dentry *dentry, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 557a5d636183..741ff8c9c6ed 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -730,33 +730,41 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) res->sr_slot = NULL; } +static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot, + u32 seqnr) +{ + if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0) + slot->seq_nr_highest_sent = seqnr; +} +static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, + u32 seqnr) +{ + slot->seq_nr_highest_sent = seqnr; + slot->seq_nr_last_acked = seqnr; +} + static int nfs41_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) { struct nfs4_session *session; struct nfs4_slot *slot = res->sr_slot; struct nfs_client *clp; - bool interrupted = false; int ret = 1; if (slot == NULL) goto out_noaction; /* don't increment the sequence number if the task wasn't sent */ - if (!RPC_WAS_SENT(task)) + if (!RPC_WAS_SENT(task) || slot->seq_done) goto out; session = slot->table->session; - if (slot->interrupted) { - if (res->sr_status != -NFS4ERR_DELAY) - slot->interrupted = 0; - interrupted = true; - } - trace_nfs4_sequence_done(session, res); /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: + /* Mark this sequence number as having been acked */ + nfs4_slot_sequence_acked(slot, slot->seq_nr); /* Update the slot's sequence and clientid lease timer */ slot->seq_done = 1; clp = session->clp; @@ -771,9 +779,9 @@ static int nfs41_sequence_process(struct rpc_task *task, * sr_status remains 1 if an RPC level error occurred. * The server may or may not have processed the sequence * operation.. - * Mark the slot as having hosted an interrupted RPC call. 
*/ - slot->interrupted = 1; + nfs4_slot_sequence_record_sent(slot, slot->seq_nr); + slot->seq_done = 1; goto out; case -NFS4ERR_DELAY: /* The server detected a resend of the RPC call and @@ -784,6 +792,7 @@ static int nfs41_sequence_process(struct rpc_task *task, __func__, slot->slot_nr, slot->seq_nr); + nfs4_slot_sequence_acked(slot, slot->seq_nr); goto out_retry; case -NFS4ERR_RETRY_UNCACHED_REP: case -NFS4ERR_SEQ_FALSE_RETRY: @@ -791,6 +800,7 @@ static int nfs41_sequence_process(struct rpc_task *task, * The server thinks we tried to replay a request. * Retry the call after bumping the sequence ID. */ + nfs4_slot_sequence_acked(slot, slot->seq_nr); goto retry_new_seq; case -NFS4ERR_BADSLOT: /* @@ -801,21 +811,28 @@ static int nfs41_sequence_process(struct rpc_task *task, goto session_recover; goto retry_nowait; case -NFS4ERR_SEQ_MISORDERED: + nfs4_slot_sequence_record_sent(slot, slot->seq_nr); /* - * Was the last operation on this sequence interrupted? - * If so, retry after bumping the sequence number. - */ - if (interrupted) - goto retry_new_seq; - /* - * Could this slot have been previously retired? - * If so, then the server may be expecting seq_nr = 1! + * Were one or more calls using this slot interrupted? + * If the server never received the request, then our + * transmitted slot sequence number may be too high. */ - if (slot->seq_nr != 1) { - slot->seq_nr = 1; + if ((s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1) { + slot->seq_nr--; goto retry_nowait; } - goto session_recover; + /* + * RFC5661: + * A retry might be sent while the original request is + * still in progress on the replier. The replier SHOULD + * deal with the issue by returning NFS4ERR_DELAY as the + * reply to SEQUENCE or CB_SEQUENCE operation, but + * implementations MAY return NFS4ERR_SEQ_MISORDERED. + * + * Restart the search after a delay. + */ + slot->seq_nr = slot->seq_nr_highest_sent; + goto out_retry; default: /* Just update the slot sequence no. 
*/ slot->seq_done = 1; @@ -906,17 +923,6 @@ static const struct rpc_call_ops nfs41_call_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; -static void -nfs4_sequence_process_interrupted(struct nfs_client *client, - struct nfs4_slot *slot, const struct cred *cred) -{ - struct rpc_task *task; - - task = _nfs41_proc_sequence(client, cred, slot, true); - if (!IS_ERR(task)) - rpc_put_task_async(task); -} - #else /* !CONFIG_NFS_V4_1 */ static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) @@ -937,16 +943,15 @@ int nfs4_sequence_done(struct rpc_task *task, } EXPORT_SYMBOL_GPL(nfs4_sequence_done); -static void -nfs4_sequence_process_interrupted(struct nfs_client *client, - struct nfs4_slot *slot, const struct cred *cred) +#endif /* !CONFIG_NFS_V4_1 */ + +static void nfs41_sequence_res_init(struct nfs4_sequence_res *res) { - WARN_ON_ONCE(1); - slot->interrupted = 0; + res->sr_timestamp = jiffies; + res->sr_status_flags = 0; + res->sr_status = 1; } -#endif /* !CONFIG_NFS_V4_1 */ - static void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -958,10 +963,6 @@ void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, args->sa_slot = slot; res->sr_slot = slot; - res->sr_timestamp = jiffies; - res->sr_status_flags = 0; - res->sr_status = 1; - } int nfs4_setup_sequence(struct nfs_client *client, @@ -982,31 +983,25 @@ int nfs4_setup_sequence(struct nfs_client *client, task->tk_timeout = 0; } - for (;;) { - spin_lock(&tbl->slot_tbl_lock); - /* The state manager will wait until the slot table is empty */ - if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) - goto out_sleep; - - slot = nfs4_alloc_slot(tbl); - if (IS_ERR(slot)) { - /* Try again in 1/4 second */ - if (slot == ERR_PTR(-ENOMEM)) - task->tk_timeout = HZ >> 2; - goto out_sleep; - } - spin_unlock(&tbl->slot_tbl_lock); + spin_lock(&tbl->slot_tbl_lock); + /* The state manager will wait until the slot table is empty */ + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) + goto out_sleep; - if (likely(!slot->interrupted)) - break; - nfs4_sequence_process_interrupted(client, - slot, task->tk_msg.rpc_cred); + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + /* Try again in 1/4 second */ + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; + goto out_sleep; } + spin_unlock(&tbl->slot_tbl_lock); nfs4_sequence_attach_slot(args, res, slot); trace_nfs4_setup_sequence(session, args); out_start: + nfs41_sequence_res_init(res); rpc_call_start(task); return 0; @@ -1555,6 +1550,10 @@ static void nfs_clear_open_stateid(struct nfs4_state *state, static void nfs_set_open_stateid_locked(struct nfs4_state *state, const nfs4_stateid *stateid, nfs4_stateid *freeme) + __must_hold(&state->owner->so_lock) + __must_hold(&state->seqlock) + __must_hold(RCU) + { DEFINE_WAIT(wait); int status = 0; @@ -2934,7 +2933,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, } out: - nfs4_sequence_free_slot(&opendata->o_res.seq_res); + if (!opendata->cancelled) + nfs4_sequence_free_slot(&opendata->o_res.seq_res); return ret; } @@ -5963,7 +5963,7 @@ out: /** * nfs4_proc_setclientid_confirm - Confirm client ID * @clp: state data structure - * @res: result of a previous SETCLIENTID + * @arg: result of a previous SETCLIENTID * @cred: credential to use for this call * * Returns zero, a negative errno, or a negative NFS4ERR status code. 
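The nfs_set_open_stateid_locked() hunk above adds sparse lock-context annotations rather than changing behaviour. A minimal illustration of the same __must_hold() annotation on a hypothetical helper whose caller is expected to hold the spinlock:

#include <linux/spinlock.h>

struct example_state {
	spinlock_t lock;
	int value;
};

/* Sparse warns if a caller reaches this without holding st->lock. */
static void example_update_locked(struct example_state *st)
	__must_hold(&st->lock)
{
	st->value++;
}

static void example_update(struct example_state *st)
{
	spin_lock(&st->lock);
	example_update_locked(st);
	spin_unlock(&st->lock);
}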
@@ -6302,7 +6302,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->arg.seqid = seqid; p->res.seqid = seqid; p->lsp = lsp; - refcount_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); p->l_ctx = nfs_get_lock_context(ctx); @@ -6527,7 +6526,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->res.lock_seqid = p->arg.lock_seqid; p->lsp = lsp; p->server = server; - refcount_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); @@ -7527,7 +7525,7 @@ int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred) return status; } -/** +/* * If 'use_integrity' is true and the state management nfs_client * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient * and the machine credential as per RFC3530bis and RFC5661 Security @@ -8937,10 +8935,12 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) if (status != 0) goto out; - /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ - if (task->tk_status < 0 || lgp->res.layoutp->len == 0) { + if (task->tk_status < 0) { status = nfs4_layoutget_handle_exception(task, lgp, &exception); *timeout = exception.timeout; + } else if (lgp->res.layoutp->len == 0) { + status = -EAGAIN; + *timeout = nfs4_update_delay(&exception.timeout); } else lseg = pnfs_layout_process(lgp); out: @@ -9219,7 +9219,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) return status; } -/** +/* * Use the state management nfs_client cl_rpcclient, which uses krb5i (if * possible) as per RFC3530bis and RFC5661 Security Considerations sections */ @@ -9484,7 +9484,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = { * @server: server / transport on which to perform the operation * @stateid: state ID to release * @cred: credential - * @is_recovery: set to true if this call needs to be privileged + * @privileged: set to true if this call needs to be privileged * * Note: this function is always asynchronous.
*/ @@ -9691,7 +9691,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_DEALLOCATE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS - | NFS_CAP_CLONE, + | NFS_CAP_CLONE + | NFS_CAP_LAYOUTERROR, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index a5489d70a724..bcb532def9e2 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -55,7 +55,7 @@ static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize) /** * nfs4_slot_tbl_drain_complete - wake waiters when drain is complete - * @tbl - controlling slot table + * @tbl: controlling slot table * */ void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl) @@ -110,6 +110,8 @@ static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl, slot->table = tbl; slot->slot_nr = slotid; slot->seq_nr = seq_init; + slot->seq_nr_highest_sent = seq_init; + slot->seq_nr_last_acked = seq_init - 1; } return slot; } @@ -276,7 +278,8 @@ static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl, p = &tbl->slots; while (*p) { (*p)->seq_nr = ivalue; - (*p)->interrupted = 0; + (*p)->seq_nr_highest_sent = ivalue; + (*p)->seq_nr_last_acked = ivalue - 1; p = &(*p)->next; } tbl->highest_used_slotid = NFS4_NO_SLOT; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 3c550f297561..b996ee23f1ba 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -10,7 +10,7 @@ /* maximum number of slots to use */ #define NFS4_DEF_SLOT_TABLE_SIZE (64U) -#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U) +#define NFS4_DEF_CB_SLOT_TABLE_SIZE (16U) #define NFS4_MAX_SLOT_TABLE (1024U) #define NFS4_NO_SLOT ((u32)-1) @@ -23,8 +23,9 @@ struct nfs4_slot { unsigned long generation; u32 slot_nr; u32 seq_nr; - unsigned int interrupted : 1, - privileged : 1, + u32 seq_nr_last_acked; + u32 seq_nr_highest_sent; + unsigned int privileged : 1, seq_done : 1; }; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 02488b50534a..3de36479ed7a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -563,6 +563,7 @@ static void nfs4_gc_state_owners(struct nfs_server *server) * nfs4_get_state_owner - Look up a state owner given a credential * @server: nfs_server to search * @cred: RPC credential to match + * @gfp_flags: allocation mode * * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL. 
*/ diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index b4557cf685fb..cd1a5c08da9a 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -524,6 +524,31 @@ TRACE_EVENT(nfs4_setup_sequence, ) ); +TRACE_EVENT(nfs4_xdr_status, + TP_PROTO( + u32 op, + int error + ), + + TP_ARGS(op, error), + + TP_STRUCT__entry( + __field(u32, op) + __field(int, error) + ), + + TP_fast_assign( + __entry->op = op; + __entry->error = -error; + ), + + TP_printk( + "operation %d: nfs status %d (%s)", + __entry->op, + __entry->error, show_nfsv4_errors(__entry->error) + ) +); + DECLARE_EVENT_CLASS(nfs4_open_event, TP_PROTO( const struct nfs_open_context *ctx, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 2fc8f6fa25e4..602446158bfb 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -54,6 +54,7 @@ #include <linux/nfs_fs.h> #include "nfs4_fs.h" +#include "nfs4trace.h" #include "internal.h" #include "nfs4idmap.h" #include "nfs4session.h" @@ -214,14 +215,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, nfs4_fattr_bitmap_maxsz) #define encode_read_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) -#define decode_read_maxsz (op_decode_hdr_maxsz + 2) +#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + 1) #define encode_readdir_maxsz (op_encode_hdr_maxsz + \ 2 + encode_verifier_maxsz + 5 + \ nfs4_label_maxsz) #define decode_readdir_maxsz (op_decode_hdr_maxsz + \ - decode_verifier_maxsz) + decode_verifier_maxsz + 1) #define encode_readlink_maxsz (op_encode_hdr_maxsz) -#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) +#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + 1) #define encode_write_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 4) #define decode_write_maxsz (op_decode_hdr_maxsz + \ @@ -283,14 +284,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, #define decode_delegreturn_maxsz (op_decode_hdr_maxsz) #define encode_getacl_maxsz (encode_getattr_maxsz) #define decode_getacl_maxsz (op_decode_hdr_maxsz + \ - nfs4_fattr_bitmap_maxsz + 1) + nfs4_fattr_bitmap_maxsz + 1 + 1) #define encode_setacl_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) #define decode_setacl_maxsz (decode_setattr_maxsz) #define encode_fs_locations_maxsz \ (encode_getattr_maxsz) #define decode_fs_locations_maxsz \ - (0) + (1) #define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) #define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4)) @@ -391,12 +392,13 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, 1 /* opaque devaddr4 length */ + \ /* devaddr4 payload is read into page */ \ 1 /* notification bitmap length */ + \ - 1 /* notification bitmap, word 0 */) + 1 /* notification bitmap, word 0 */ + \ + 1 /* possible XDR padding */) #define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ encode_stateid_maxsz) #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ decode_stateid_maxsz + \ - XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + 1) #define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \ 2 /* offset */ + \ 2 /* length */ + \ @@ -1015,12 +1017,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - struct rpc_auth *auth = req->rq_cred->cr_auth; /* initialize running count of expected bytes in reply. * NOTE: the replied tag SHOULD be the same as the one sent, * but this is not required as a MUST for the server to do so.
*/ - hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; + hdr->replen = 3 + hdr->taglen; WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN); encode_string(xdr, hdr->taglen, hdr->tag); @@ -2340,9 +2341,9 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); if (args->lg_args) { encode_layoutget(xdr, args->lg_args, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, - args->lg_args->layout.pages, - 0, args->lg_args->layout.pglen); + rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0, + args->lg_args->layout.pglen, + hdr.replen); } encode_nops(&hdr); } @@ -2386,9 +2387,9 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); if (args->lg_args) { encode_layoutget(xdr, args->lg_args, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, - args->lg_args->layout.pages, - 0, args->lg_args->layout.pglen); + rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0, + args->lg_args->layout.pglen, + hdr.replen); } encode_nops(&hdr); } @@ -2498,8 +2499,8 @@ static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_readlink(xdr, args, req, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, - args->pgbase, args->pglen); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->pglen, hdr.replen); encode_nops(&hdr); } @@ -2519,11 +2520,8 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_readdir(xdr, args, req, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, - args->pgbase, args->count); - dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", - __func__, hdr.replen << 2, args->pages, - args->pgbase, args->count); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, hdr.replen); encode_nops(&hdr); } @@ -2543,8 +2541,8 @@ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_read(xdr, args, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, - args->pages, args->pgbase, args->count); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, hdr.replen); req->rq_rcv_buf.flags |= XDRBUF_READ; encode_nops(&hdr); } @@ -2590,9 +2588,8 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, encode_getattr(xdr, nfs4_acl_bitmap, NULL, ARRAY_SIZE(nfs4_acl_bitmap), &hdr); - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, - args->acl_pages, 0, args->acl_len); - + rpc_prepare_reply_pages(req, args->acl_pages, 0, + args->acl_len, replen + 1); encode_nops(&hdr); } @@ -2813,9 +2810,8 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, encode_fs_locations(xdr, args->bitmask, &hdr); } - /* Set up reply kvec to capture returned fs_locations array. */ - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, - (struct page **)&args->page, 0, PAGE_SIZE); + rpc_prepare_reply_pages(req, (struct page **)&args->page, 0, + PAGE_SIZE, replen + 1); encode_nops(&hdr); } @@ -3017,10 +3013,8 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, /* set up reply kvec. 
Subtract notification bitmap max size (2) * so that notification bitmap is put in xdr_buf tail */ - xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2, - args->pdev->pages, args->pdev->pgbase, - args->pdev->pglen); - + rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase, + args->pdev->pglen, hdr.replen - 2); encode_nops(&hdr); } @@ -3041,9 +3035,8 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, encode_putfh(xdr, NFS_FH(args->inode), &hdr); encode_layoutget(xdr, args, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, - args->layout.pages, 0, args->layout.pglen); - + rpc_prepare_reply_pages(req, args->layout.pages, 0, + args->layout.pglen, hdr.replen); encode_nops(&hdr); } @@ -3144,22 +3137,12 @@ static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req, } #endif /* CONFIG_NFS_V4_1 */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("nfs: %s: prematurely hit end of receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) { ssize_t ret = xdr_stream_decode_opaque_inline(xdr, (void **)string, NFS4_OPAQUE_LIMIT); - if (unlikely(ret < 0)) { - if (ret == -EBADMSG) - print_overflow_msg(__func__, xdr); + if (unlikely(ret < 0)) return -EIO; - } *len = ret; return 0; } @@ -3170,22 +3153,19 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; hdr->status = be32_to_cpup(p++); hdr->taglen = be32_to_cpup(p); p = xdr_inline_decode(xdr, hdr->taglen + 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; hdr->tag = (char *)p; p += XDR_QUADLEN(hdr->taglen); hdr->nops = be32_to_cpup(p); if (unlikely(hdr->nops < 1)) return nfs4_stat_to_errno(hdr->status); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, @@ -3201,11 +3181,14 @@ static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, opnum = be32_to_cpup(p++); if (unlikely(opnum != expected)) goto out_bad_operation; + if (unlikely(*p != cpu_to_be32(NFS_OK))) + goto out_status; + *nfs_retval = 0; + return true; +out_status: nfserr = be32_to_cpup(p); - if (nfserr == NFS_OK) - *nfs_retval = 0; - else - *nfs_retval = nfs4_stat_to_errno(nfserr); + trace_nfs4_xdr_status(opnum, nfserr); + *nfs_retval = nfs4_stat_to_errno(nfserr); return true; out_bad_operation: dprintk("nfs: Server returned operation" @@ -3214,7 +3197,6 @@ out_bad_operation: *nfs_retval = -EREMOTEIO; return false; out_overflow: - print_overflow_msg(__func__, xdr); *nfs_retval = -EIO; return false; } @@ -3235,10 +3217,9 @@ static int decode_ace(struct xdr_stream *xdr, void *ace) char *str; p = xdr_inline_decode(xdr, 12); - if (likely(p)) - return decode_opaque_inline(xdr, &strlen, &str); - print_overflow_msg(__func__, xdr); - return -EIO; + if (unlikely(!p)) + return -EIO; + return decode_opaque_inline(xdr, &strlen, &str); } static ssize_t @@ -3249,10 +3230,9 @@ decode_bitmap4(struct xdr_stream *xdr, uint32_t *bitmap, size_t sz) ret = xdr_stream_decode_uint32_array(xdr, bitmap, sz); if (likely(ret >= 0)) return ret; - if (ret == -EMSGSIZE) - return sz; - print_overflow_msg(__func__, xdr); - return -EIO; + if (ret != -EMSGSIZE) + return -EIO; + return sz; } static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) @@ -3268,13 
+3248,10 @@ static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigne p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *attrlen = be32_to_cpup(p); *savep = xdr_stream_pos(xdr); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) @@ -3303,7 +3280,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *type = be32_to_cpup(p); if (*type < NF4REG || *type > NF4NAMEDATTR) { dprintk("%s: bad type %d\n", __func__, *type); @@ -3314,9 +3291,6 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * } dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fh_expire_type(struct xdr_stream *xdr, @@ -3330,15 +3304,12 @@ static int decode_attr_fh_expire_type(struct xdr_stream *xdr, if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *type = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE; } dprintk("%s: expire type=0x%x\n", __func__, *type); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) @@ -3352,7 +3323,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; ret = NFS_ATTR_FATTR_CHANGE; @@ -3360,9 +3331,6 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) @@ -3376,16 +3344,13 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, size); bitmap[0] &= ~FATTR4_WORD0_SIZE; ret = NFS_ATTR_FATTR_SIZE; } dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3398,15 +3363,12 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; } dprintk("%s: link support=%s\n", __func__, *res == 0 ? 
"false" : "true"); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3419,15 +3381,12 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; } dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) @@ -3442,7 +3401,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { p = xdr_inline_decode(xdr, 16); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &fsid->major); xdr_decode_hyper(p, &fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; @@ -3452,9 +3411,6 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs (unsigned long long)fsid->major, (unsigned long long)fsid->minor); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3467,15 +3423,12 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; } dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res) @@ -3487,14 +3440,11 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t * if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; *res = -be32_to_cpup(p); } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_exclcreat_supported(struct xdr_stream *xdr, @@ -3526,13 +3476,13 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); if (len > NFS4_FHSIZE) return -EIO; p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; if (fh != NULL) { memcpy(fh->data, p, len); fh->size = len; @@ -3540,9 +3490,6 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE; } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3555,15 +3502,12 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; } dprintk("%s: ACLs supported=%u\n", __func__, (unsigned 
int)*res); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -3577,16 +3521,13 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -3600,16 +3541,13 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3623,15 +3561,12 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; } dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3645,15 +3580,12 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; } dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3667,15 +3599,12 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; } dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) @@ -3686,7 +3615,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; n = be32_to_cpup(p); if (n == 0) goto root_path; @@ -3718,9 +3647,6 @@ out_eio: dprintk(" status %d", status); status = -EIO; goto out; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) @@ -3745,7 +3671,7 @@ static int 
decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st goto out; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + goto out_eio; n = be32_to_cpup(p); if (n <= 0) goto out_eio; @@ -3758,7 +3684,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st loc = &res->locations[res->nlocations]; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + goto out_eio; m = be32_to_cpup(p); dprintk("%s: servers:\n", __func__); @@ -3796,8 +3722,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st out: dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; -out_overflow: - print_overflow_msg(__func__, xdr); out_eio: status = -EIO; goto out; @@ -3814,15 +3738,12 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; } dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) @@ -3836,15 +3757,12 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *maxlink = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXLINK; } dprintk("%s: maxlink=%u\n", __func__, *maxlink); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) @@ -3858,15 +3776,12 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *maxname = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXNAME; } dprintk("%s: maxname=%u\n", __func__, *maxname); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3881,7 +3796,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ uint64_t maxread; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, &maxread); if (maxread > 0x7FFFFFFF) maxread = 0x7FFFFFFF; @@ -3890,9 +3805,6 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ } dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3907,7 +3819,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 uint64_t maxwrite; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, &maxwrite); if (maxwrite > 0x7FFFFFFF) maxwrite = 0x7FFFFFFF; @@ -3916,9 +3828,6 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 } dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } 
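Every conversion in this long run of nfs4xdr.c hunks is the same mechanical rewrite: each decoder's out_overflow: label existed only to call print_overflow_msg() before returning -EIO, and once short replies are reported through the new nfs4_xdr_status tracepoint the label collapses into a direct return. A simplified userspace sketch of the before/after shape follows; the stream stub and decoder names are illustrative stand-ins, not the kernel's xdr_inline_decode()/be32_to_cpup() API:

#include <stdio.h>
#include <stdint.h>

#define EIO 5

struct xdr_stream {
	const unsigned char *p;		/* read cursor */
	const unsigned char *end;	/* end of buffered reply data */
};

/* Stand-in for xdr_inline_decode(): NULL means the reply ran short. */
static const unsigned char *inline_decode(struct xdr_stream *xdr, size_t nbytes)
{
	const unsigned char *p = xdr->p;

	if ((size_t)(xdr->end - xdr->p) < nbytes)
		return NULL;
	xdr->p += nbytes;
	return p;
}

/* Before: a label whose only purpose was to log the overflow. */
static int decode_u32_old(struct xdr_stream *xdr, uint32_t *res)
{
	const unsigned char *p = inline_decode(xdr, 4);

	if (!p)
		goto out_overflow;
	*res = (uint32_t)p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
	return 0;
out_overflow:
	fprintf(stderr, "prematurely hit end of receive buffer\n");
	return -EIO;
}

/* After: the error path is a plain return. */
static int decode_u32_new(struct xdr_stream *xdr, uint32_t *res)
{
	const unsigned char *p = inline_decode(xdr, 4);

	if (!p)
		return -EIO;
	*res = (uint32_t)p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
	return 0;
}

int main(void)
{
	const unsigned char buf[] = { 0x00, 0x00, 0x00, 0x2a };
	struct xdr_stream xdr = { buf, buf + sizeof(buf) };
	uint32_t v = 0;

	if (decode_u32_new(&xdr, &v) == 0)
		printf("decoded %u\n", v);	/* prints "decoded 42" */

	xdr.p = buf + 2;			/* leave only two bytes */
	if (decode_u32_old(&xdr, &v) == -EIO)
		printf("old-style decoder logged and returned -EIO\n");
	return 0;
}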
static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) @@ -3933,7 +3842,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; tmp = be32_to_cpup(p); *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; @@ -3941,9 +3850,6 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) @@ -3957,16 +3863,13 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *nlink = be32_to_cpup(p); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; ret = NFS_ATTR_FATTR_NLINK; } dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static ssize_t decode_nfs4_string(struct xdr_stream *xdr, @@ -4011,10 +3914,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, return NFS_ATTR_FATTR_OWNER; } out: - if (len != -EBADMSG) - return 0; - print_overflow_msg(__func__, xdr); - return -EIO; + if (len == -EBADMSG) + return -EIO; + return 0; } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, @@ -4046,10 +3948,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, return NFS_ATTR_FATTR_GROUP; } out: - if (len != -EBADMSG) - return 0; - print_overflow_msg(__func__, xdr); - return -EIO; + if (len == -EBADMSG) + return -EIO; + return 0; } static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) @@ -4066,7 +3967,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; major = be32_to_cpup(p++); minor = be32_to_cpup(p); tmp = MKDEV(major, minor); @@ -4077,9 +3978,6 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde } dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -4093,15 +3991,12 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; } dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -4115,15 +4010,12 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; } dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, 
xdr); - return -EIO; } static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -4137,15 +4029,12 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; } dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) @@ -4159,7 +4048,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; ret = NFS_ATTR_FATTR_SPACE_USED; @@ -4167,9 +4056,6 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static __be32 * @@ -4189,12 +4075,9 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) p = xdr_inline_decode(xdr, nfstime4_maxsz << 2); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_nfstime4(p, time); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) @@ -4265,19 +4148,19 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; lfs = be32_to_cpup(p++); p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; pi = be32_to_cpup(p++); p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p++); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; if (len < NFS4_MAXLABELLEN) { if (label) { memcpy(label->label, p, len); @@ -4295,10 +4178,6 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, (char *)label->label, label->len, label->pi, label->lfs); return status; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) @@ -4342,14 +4221,11 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c p = xdr_inline_decode(xdr, 20); if (unlikely(!p)) - goto out_overflow; + return -EIO; cinfo->atomic = be32_to_cpup(p++); p = xdr_decode_hyper(p, &cinfo->before); xdr_decode_hyper(p, &cinfo->after); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access) @@ -4363,24 +4239,19 @@ static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access) return status; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; supp = be32_to_cpup(p++); acc = be32_to_cpup(p); *supported = supp; *access = acc; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - 
return -EIO; } static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) { ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len); - if (unlikely(ret < 0)) { - print_overflow_msg(__func__, xdr); + if (unlikely(ret < 0)) return -EIO; - } return 0; } @@ -4460,13 +4331,11 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) return status; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; bmlen = be32_to_cpup(p); p = xdr_inline_decode(xdr, bmlen << 2); if (likely(p)) return 0; -out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -4574,13 +4443,10 @@ static int decode_threshold_hint(struct xdr_stream *xdr, if (likely(bitmap[0] & hint_bit)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_first_threshold_item4(struct xdr_stream *xdr, @@ -4593,10 +4459,8 @@ static int decode_first_threshold_item4(struct xdr_stream *xdr, /* layout type */ p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) { - print_overflow_msg(__func__, xdr); + if (unlikely(!p)) return -EIO; - } res->l_type = be32_to_cpup(p); /* thi_hintset bitmap */ @@ -4654,7 +4518,7 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr, return -EREMOTEIO; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; num = be32_to_cpup(p); if (num == 0) return 0; @@ -4667,9 +4531,6 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr, bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD; } return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, @@ -4857,7 +4718,7 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; fsinfo->nlayouttypes = be32_to_cpup(p); /* pNFS is not supported by the underlying file system */ @@ -4867,7 +4728,7 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr, /* Decode and set first layout type, move xdr->p past unused types */ p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; /* If we get too many, then just cap it at the max */ if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) { @@ -4879,9 +4740,6 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr, for(i = 0; i < fsinfo->nlayouttypes; ++i) fsinfo->layouttype[i] = be32_to_cpup(p++); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -4915,10 +4773,8 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, *res = 0; if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) { - print_overflow_msg(__func__, xdr); + if (unlikely(!p)) return -EIO; - } *res = be32_to_cpup(p); bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; } @@ -4937,10 +4793,8 @@ static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap, *res = 0; if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) { p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) { - print_overflow_msg(__func__, xdr); + if (unlikely(!p)) return -EIO; - } *res = be32_to_cpup(p); bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE; } @@ -5016,19 +4870,16 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto 
out_overflow; + return -EIO; len = be32_to_cpup(p); if (len > NFS4_FHSIZE) return -EIO; fh->size = len; p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; memcpy(fh->data, p, len); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -5052,7 +4903,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) p = xdr_inline_decode(xdr, 32); /* read 32 bytes */ if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */ p = xdr_decode_hyper(p, &length); type = be32_to_cpup(p++); /* 4 byte read */ @@ -5069,11 +4920,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */ namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */ p = xdr_inline_decode(xdr, namelen); /* variable size field */ - if (likely(p)) - return -NFS4ERR_DENIED; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; + if (unlikely(!p)) + return -EIO; + return -NFS4ERR_DENIED; } static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) @@ -5142,7 +4991,7 @@ static int decode_space_limit(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EIO; limit_type = be32_to_cpup(p++); switch (limit_type) { case NFS4_LIMIT_SIZE: @@ -5156,9 +5005,6 @@ static int decode_space_limit(struct xdr_stream *xdr, maxsize >>= PAGE_SHIFT; *pagemod_limit = min_t(u64, maxsize, ULONG_MAX); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_rw_delegation(struct xdr_stream *xdr, @@ -5173,7 +5019,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr, return status; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->do_recall = be32_to_cpup(p); switch (delegation_type) { @@ -5186,9 +5032,6 @@ static int decode_rw_delegation(struct xdr_stream *xdr, return -EIO; } return decode_ace(xdr, NULL); -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res) @@ -5198,7 +5041,7 @@ static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; why_no_delegation = be32_to_cpup(p); switch (why_no_delegation) { case WND4_CONTENTION: @@ -5207,9 +5050,6 @@ static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res) /* Ignore for now */ } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) @@ -5219,7 +5059,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; delegation_type = be32_to_cpup(p); res->delegation_type = 0; switch (delegation_type) { @@ -5232,9 +5072,6 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) return decode_no_delegation(xdr, res); } return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) @@ -5256,7 +5093,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, 8); if
(unlikely(!p)) - goto out_overflow; + return -EIO; res->rflags = be32_to_cpup(p++); bmlen = be32_to_cpup(p); if (bmlen > 10) @@ -5264,7 +5101,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, bmlen << 2); if (unlikely(!p)) - goto out_overflow; + return -EIO; savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); for (i = 0; i < savewords; ++i) res->attrset[i] = be32_to_cpup(p++); @@ -5275,9 +5112,6 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) xdr_error: dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) @@ -5326,7 +5160,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, return status; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; eof = be32_to_cpup(p++); count = be32_to_cpup(p); recvd = xdr_read_pages(xdr, count); @@ -5339,9 +5173,6 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, res->eof = eof; res->count = count; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) @@ -5374,7 +5205,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) /* Convert length of symlink */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); if (len >= rcvbuf->page_len || len <= 0) { dprintk("nfs: server returned giant symlink!\n"); @@ -5395,9 +5226,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) */ xdr_terminate_string(rcvbuf, len); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -5500,7 +5328,6 @@ static int decode_setattr(struct xdr_stream *xdr) return status; if (decode_bitmap4(xdr, NULL, 0) >= 0) return 0; - print_overflow_msg(__func__, xdr); return -EIO; } @@ -5512,7 +5339,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; opnum = be32_to_cpup(p++); if (opnum != OP_SETCLIENTID) { dprintk("nfs: decode_setclientid: Server returned operation" @@ -5523,7 +5350,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re if (nfserr == NFS_OK) { p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->clientid); memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE); } else if (nfserr == NFSERR_CLID_INUSE) { @@ -5532,28 +5359,25 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re /* skip netid string */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; /* skip uaddr string */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; return -NFSERR_CLID_INUSE; } else return nfs4_stat_to_errno(nfserr); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int 
decode_setclientid_confirm(struct xdr_stream *xdr) @@ -5572,13 +5396,10 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res) p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->count = be32_to_cpup(p++); res->verf->committed = be32_to_cpup(p++); return decode_write_verifier(xdr, &res->verf->verifier); -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_delegreturn(struct xdr_stream *xdr) @@ -5594,30 +5415,24 @@ static int decode_secinfo_gss(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; oid_len = be32_to_cpup(p); if (oid_len > GSS_OID_MAX_LEN) - goto out_err; + return -EINVAL; p = xdr_inline_decode(xdr, oid_len); if (unlikely(!p)) - goto out_overflow; + return -EIO; memcpy(flavor->flavor_info.oid.data, p, oid_len); flavor->flavor_info.oid.len = oid_len; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; flavor->flavor_info.qop = be32_to_cpup(p++); flavor->flavor_info.service = be32_to_cpup(p); return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; -out_err: - return -EINVAL; } static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) @@ -5629,7 +5444,7 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->flavors->num_flavors = 0; num_flavors = be32_to_cpup(p); @@ -5641,7 +5456,7 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; sec_flavor->flavor = be32_to_cpup(p); if (sec_flavor->flavor == RPC_AUTH_GSS) { @@ -5655,9 +5470,6 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res status = 0; out: return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) @@ -5711,11 +5523,11 @@ static int decode_exchange_id(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, &res->clientid); p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->seqid = be32_to_cpup(p++); res->flags = be32_to_cpup(p++); @@ -5739,7 +5551,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, /* server_owner4.so_minor_id */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->server_owner->minor_id); /* server_owner4.so_major_id */ @@ -5759,7 +5571,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, /* Implementation Id */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; impl_id_count = be32_to_cpup(p++); if (impl_id_count) { @@ -5778,16 +5590,13 @@ static int decode_exchange_id(struct xdr_stream *xdr, /* nii_date */ p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->impl_id->date.seconds); res->impl_id->date.nseconds = be32_to_cpup(p); /* if there's more than one entry, ignore the rest */ } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_chan_attrs(struct xdr_stream *xdr, @@ -5798,7 +5607,7 @@ static int decode_chan_attrs(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 28); if 
(unlikely(!p)) - goto out_overflow; + return -EIO; val = be32_to_cpup(p++); /* headerpadsz */ if (val) return -EINVAL; /* no support for header padding yet */ @@ -5816,12 +5625,9 @@ static int decode_chan_attrs(struct xdr_stream *xdr, if (nr_attrs == 1) { p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */ if (unlikely(!p)) - goto out_overflow; + return -EIO; } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) @@ -5844,7 +5650,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr, /* dir flags, rdma mode bool */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->dir = be32_to_cpup(p++); if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH) @@ -5855,9 +5661,6 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr, res->use_conn_in_rdma_mode = true; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_create_session(struct xdr_stream *xdr, @@ -5875,7 +5678,7 @@ static int decode_create_session(struct xdr_stream *xdr, /* seqid, flags */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->seqid = be32_to_cpup(p++); res->flags = be32_to_cpup(p); @@ -5884,9 +5687,6 @@ static int decode_create_session(struct xdr_stream *xdr, if (!status) status = decode_chan_attrs(xdr, &res->bc_attrs); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) @@ -5967,7 +5767,6 @@ out_err: res->sr_status = status; return status; out_overflow: - print_overflow_msg(__func__, xdr); status = -EIO; goto out_err; #else /* CONFIG_NFS_V4_1 */ @@ -5995,7 +5794,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, if (status == -ETOOSMALL) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; pdev->mincount = be32_to_cpup(p); dprintk("%s: Min count too small. mincnt = %u\n", __func__, pdev->mincount); @@ -6005,7 +5804,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; type = be32_to_cpup(p++); if (type != pdev->layout_type) { dprintk("%s: layout mismatch req: %u pdev: %u\n", @@ -6019,19 +5818,19 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, */ pdev->mincount = be32_to_cpup(p); if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount) - goto out_overflow; + return -EIO; /* Parse notification bitmap, verifying that it is zero. 
*/ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); if (len) { uint32_t i; p = xdr_inline_decode(xdr, 4 * len); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->notification = be32_to_cpup(p++); for (i = 1; i < len; i++) { @@ -6043,9 +5842,6 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, } } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, @@ -6115,7 +5911,6 @@ out: res->status = status; return status; out_overflow: - print_overflow_msg(__func__, xdr); status = -EIO; goto out; } @@ -6131,16 +5926,13 @@ static int decode_layoutreturn(struct xdr_stream *xdr, return status; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->lrs_present = be32_to_cpup(p); if (res->lrs_present) status = decode_layout_stateid(xdr, &res->stateid); else nfs4_stateid_copy(&res->stateid, &invalid_stateid); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_layoutcommit(struct xdr_stream *xdr, @@ -6158,19 +5950,16 @@ static int decode_layoutcommit(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; sizechanged = be32_to_cpup(p); if (sizechanged) { /* throw away new size */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_test_stateid(struct xdr_stream *xdr, @@ -6186,21 +5975,17 @@ static int decode_test_stateid(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; num_res = be32_to_cpup(p++); if (num_res != 1) - goto out; + return -EIO; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->status = be32_to_cpup(p++); return status; -out_overflow: - print_overflow_msg(__func__, xdr); -out: - return -EIO; } static int decode_free_stateid(struct xdr_stream *xdr, @@ -7570,11 +7355,11 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, uint64_t new_cookie; __be32 *p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; if (*p == xdr_zero) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; if (*p == xdr_zero) return -EAGAIN; entry->eof = 1; @@ -7583,13 +7368,13 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; p = xdr_decode_hyper(p, &new_cookie); entry->len = be32_to_cpup(p); p = xdr_inline_decode(xdr, entry->len); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; entry->name = (const char *) p; /* @@ -7601,14 +7386,14 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->fattr->valid = 0; if (decode_attr_bitmap(xdr, bitmap) < 0) - goto out_overflow; + return -EAGAIN; if (decode_attr_length(xdr, &len, &savep) < 0) - goto out_overflow; + return -EAGAIN; if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, NULL, entry->label, entry->server) < 0) - goto out_overflow; + return -EAGAIN; if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) entry->ino = entry->fattr->mounted_on_fileid; else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) @@ -7622,10 +7407,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry 
*entry, entry->cookie = new_cookie; return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EAGAIN; } /* @@ -7791,6 +7572,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(COPY, enc_copy, dec_copy), PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel), PROC(LOOKUPP, enc_lookupp, dec_lookupp), + PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c index b60d5fbd7727..a90b363500c2 100644 --- a/fs/nfs/nfstrace.c +++ b/fs/nfs/nfstrace.c @@ -11,3 +11,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_status); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index bd60f8d1e181..a0d6910aa03a 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -969,6 +969,91 @@ TRACE_EVENT(nfs_commit_done, ) ); +TRACE_DEFINE_ENUM(NFS_OK); +TRACE_DEFINE_ENUM(NFSERR_PERM); +TRACE_DEFINE_ENUM(NFSERR_NOENT); +TRACE_DEFINE_ENUM(NFSERR_IO); +TRACE_DEFINE_ENUM(NFSERR_NXIO); +TRACE_DEFINE_ENUM(NFSERR_ACCES); +TRACE_DEFINE_ENUM(NFSERR_EXIST); +TRACE_DEFINE_ENUM(NFSERR_XDEV); +TRACE_DEFINE_ENUM(NFSERR_NODEV); +TRACE_DEFINE_ENUM(NFSERR_NOTDIR); +TRACE_DEFINE_ENUM(NFSERR_ISDIR); +TRACE_DEFINE_ENUM(NFSERR_INVAL); +TRACE_DEFINE_ENUM(NFSERR_FBIG); +TRACE_DEFINE_ENUM(NFSERR_NOSPC); +TRACE_DEFINE_ENUM(NFSERR_ROFS); +TRACE_DEFINE_ENUM(NFSERR_MLINK); +TRACE_DEFINE_ENUM(NFSERR_NAMETOOLONG); +TRACE_DEFINE_ENUM(NFSERR_NOTEMPTY); +TRACE_DEFINE_ENUM(NFSERR_DQUOT); +TRACE_DEFINE_ENUM(NFSERR_STALE); +TRACE_DEFINE_ENUM(NFSERR_REMOTE); +TRACE_DEFINE_ENUM(NFSERR_WFLUSH); +TRACE_DEFINE_ENUM(NFSERR_BADHANDLE); +TRACE_DEFINE_ENUM(NFSERR_NOT_SYNC); +TRACE_DEFINE_ENUM(NFSERR_BAD_COOKIE); +TRACE_DEFINE_ENUM(NFSERR_NOTSUPP); +TRACE_DEFINE_ENUM(NFSERR_TOOSMALL); +TRACE_DEFINE_ENUM(NFSERR_SERVERFAULT); +TRACE_DEFINE_ENUM(NFSERR_BADTYPE); +TRACE_DEFINE_ENUM(NFSERR_JUKEBOX); + +#define nfs_show_status(x) \ + __print_symbolic(x, \ + { NFS_OK, "OK" }, \ + { NFSERR_PERM, "PERM" }, \ + { NFSERR_NOENT, "NOENT" }, \ + { NFSERR_IO, "IO" }, \ + { NFSERR_NXIO, "NXIO" }, \ + { NFSERR_ACCES, "ACCES" }, \ + { NFSERR_EXIST, "EXIST" }, \ + { NFSERR_XDEV, "XDEV" }, \ + { NFSERR_NODEV, "NODEV" }, \ + { NFSERR_NOTDIR, "NOTDIR" }, \ + { NFSERR_ISDIR, "ISDIR" }, \ + { NFSERR_INVAL, "INVAL" }, \ + { NFSERR_FBIG, "FBIG" }, \ + { NFSERR_NOSPC, "NOSPC" }, \ + { NFSERR_ROFS, "ROFS" }, \ + { NFSERR_MLINK, "MLINK" }, \ + { NFSERR_NAMETOOLONG, "NAMETOOLONG" }, \ + { NFSERR_NOTEMPTY, "NOTEMPTY" }, \ + { NFSERR_DQUOT, "DQUOT" }, \ + { NFSERR_STALE, "STALE" }, \ + { NFSERR_REMOTE, "REMOTE" }, \ + { NFSERR_WFLUSH, "WFLUSH" }, \ + { NFSERR_BADHANDLE, "BADHANDLE" }, \ + { NFSERR_NOT_SYNC, "NOTSYNC" }, \ + { NFSERR_BAD_COOKIE, "BADCOOKIE" }, \ + { NFSERR_NOTSUPP, "NOTSUPP" }, \ + { NFSERR_TOOSMALL, "TOOSMALL" }, \ + { NFSERR_SERVERFAULT, "REMOTEIO" }, \ + { NFSERR_BADTYPE, "BADTYPE" }, \ + { NFSERR_JUKEBOX, "JUKEBOX" }) + +TRACE_EVENT(nfs_xdr_status, + TP_PROTO( + int error + ), + + TP_ARGS(error), + + TP_STRUCT__entry( + __field(int, error) + ), + + TP_fast_assign( + __entry->error = error; + ), + + TP_printk( + "error=%d (%s)", + __entry->error, nfs_show_status(__entry->error) + ) +); + #endif /* _TRACE_NFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e54d899c1848..e9f39fa5964b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -350,7 +350,7 @@ nfs_create_request(struct 
nfs_open_context *ctx, struct page *page, /** * nfs_unlock_request - Unlock request and wake up sleepers. - * @req: + * @req: pointer to request */ void nfs_unlock_request(struct nfs_page *req) { @@ -368,7 +368,7 @@ void nfs_unlock_request(struct nfs_page *req) /** * nfs_unlock_and_release_request - Unlock request and release the nfs_page - * @req: + * @req: pointer to request */ void nfs_unlock_and_release_request(struct nfs_page *req) { @@ -531,7 +531,6 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); * nfs_pgio_rpcsetup - Set up arguments for a pageio call * @hdr: The pageio hdr * @count: Number of bytes to read - * @offset: Initial offset * @how: How to commit data (writes only) * @cinfo: Commit information for the call (writes only) */ @@ -634,7 +633,6 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio); /** * nfs_pgio_error - Clean up from a pageio error - * @desc: IO descriptor * @hdr: pageio header */ static void nfs_pgio_error(struct nfs_pgio_header *hdr) @@ -768,8 +766,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, pageused = 0; while (!list_empty(head)) { req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); + nfs_list_move_request(req, &hdr->pages); if (!last_page || last_page != req->wb_page) { pageused++; @@ -893,6 +890,7 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1, * nfs_can_coalesce_requests - test two requests for compatibility * @prev: pointer to nfs_page * @req: pointer to nfs_page + * @pgio: pointer to nfs_pageio_descriptor * * The nfs_page structures 'prev' and 'req' are compared to ensure that the * page data area they describe is contiguous, and that their RPC @@ -961,8 +959,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, } if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; - nfs_list_remove_request(req); - nfs_list_add_request(req, &mirror->pg_list); + nfs_list_move_request(req, &mirror->pg_list); mirror->pg_count += req->wb_bytes; return 1; } @@ -988,6 +985,16 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) } } +static void +nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + LIST_HEAD(head); + + nfs_list_move_request(req, &head); + desc->pg_completion_ops->error_cleanup(&head, desc->pg_error); +} + /** * nfs_pageio_add_request - Attempt to coalesce a request into a page list. 
* @desc: destination io descriptor @@ -1025,10 +1032,8 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, nfs_page_group_unlock(req); desc->pg_moreio = 1; nfs_pageio_doio(desc); - if (desc->pg_error < 0) - return 0; - if (mirror->pg_recoalesce) - return 0; + if (desc->pg_error < 0 || mirror->pg_recoalesce) + goto out_cleanup_subreq; /* retry add_request for this subreq */ nfs_page_group_lock(req); continue; @@ -1061,6 +1066,10 @@ err_ptr: desc->pg_error = PTR_ERR(subreq); nfs_page_group_unlock(req); return 0; +out_cleanup_subreq: + if (req != subreq) + nfs_pageio_cleanup_request(desc, subreq); + return 0; } static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) @@ -1079,7 +1088,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) struct nfs_page *req; req = list_first_entry(&head, struct nfs_page, wb_list); - nfs_list_remove_request(req); if (__nfs_pageio_add_request(desc, req)) continue; if (desc->pg_error < 0) { @@ -1120,7 +1128,8 @@ static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc) for (midx = 0; midx < desc->pg_mirror_count; midx++) { mirror = &desc->pg_mirrors[midx]; - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); + desc->pg_completion_ops->error_cleanup(&mirror->pg_list, + desc->pg_error); } } @@ -1168,11 +1177,14 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (nfs_pgio_has_mirroring(desc)) desc->pg_mirror_idx = midx; if (!nfs_pageio_add_request_mirror(desc, dupreq)) - goto out_failed; + goto out_cleanup_subreq; } return 1; +out_cleanup_subreq: + if (req != dupreq) + nfs_pageio_cleanup_request(desc, dupreq); out_failed: nfs_pageio_error_cleanup(desc); return 0; @@ -1194,7 +1206,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, desc->pg_mirror_idx = mirror_idx; for (;;) { nfs_pageio_doio(desc); - if (!mirror->pg_recoalesce) + if (desc->pg_error < 0 || !mirror->pg_recoalesce) break; if (!nfs_do_recoalesce(desc)) break; @@ -1222,9 +1234,8 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, while (!list_empty(&hdr->pages)) { struct nfs_page *req = nfs_list_entry(hdr->pages.next); - nfs_list_remove_request(req); if (!nfs_pageio_add_request(desc, req)) - nfs_list_add_request(req, &failed); + nfs_list_move_request(req, &failed); } nfs_pageio_complete(desc); if (!list_empty(&failed)) { diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 53726da5c010..7066cd7c7aff 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -758,22 +758,35 @@ static int pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, struct nfs_server *server, struct list_head *layout_list) + __must_hold(&clp->cl_lock) + __must_hold(RCU) { struct pnfs_layout_hdr *lo, *next; struct inode *inode; list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { - if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) + if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || + test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) || + !list_empty(&lo->plh_bulk_destroy)) continue; + /* If the sb is being destroyed, just bail */ + if (!nfs_sb_active(server->super)) + break; inode = igrab(lo->plh_inode); - if (inode == NULL) - continue; - list_del_init(&lo->plh_layouts); - if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) - continue; - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - iput(inode); + if (inode != NULL) { + list_del_init(&lo->plh_layouts); + if (pnfs_layout_add_bulk_destroy_list(inode, + layout_list)) + continue; + rcu_read_unlock(); + 
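/*
 * Ordering matters here: the RCU read lock and clp->cl_lock are both
 * dropped before the sleeping operations below (iput() and
 * nfs_sb_deactive()), and both locks are retaken just before the
 * -EAGAIN return so the caller can restart the scan.
 */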
spin_unlock(&clp->cl_lock); + iput(inode); + } else { + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags); + } + nfs_sb_deactive(server->super); spin_lock(&clp->cl_lock); rcu_read_lock(); return -EAGAIN; @@ -811,7 +824,7 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, /* Free all lsegs that are attached to commit buckets */ nfs_commit_inode(inode, 0); pnfs_put_layout_hdr(lo); - iput(inode); + nfs_iput_and_deactive(inode); } return ret; } @@ -1876,7 +1889,7 @@ lookup_again: atomic_read(&lo->plh_outstanding) != 0) { spin_unlock(&ino->i_lock); lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding, - atomic_read(&lo->plh_outstanding))); + !atomic_read(&lo->plh_outstanding))); if (IS_ERR(lseg) || !list_empty(&lo->plh_segs)) goto out_put_layout_hdr; pnfs_put_layout_hdr(lo); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 5e80a07b7bea..c0420b979d88 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -104,6 +104,7 @@ enum { NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ + NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */ }; enum layoutdriver_policy_flags { @@ -349,6 +350,7 @@ void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nf void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *, const struct nfs4_deviceid *); bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); +void nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node); void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node); bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); void nfs4_deviceid_purge_client(const struct nfs_client *); diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 7fb59487ee90..537b80d693f1 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -284,10 +284,22 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node); void +nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node) +{ + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) { + clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + smp_mb__after_atomic(); + } +} +EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_available); + +void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node) { node->timestamp_unavailable = jiffies; + smp_mb__before_atomic(); set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + smp_mb__after_atomic(); } EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable); @@ -302,6 +314,7 @@ nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node) if (time_in_range(node->timestamp_unavailable, start, end)) return true; clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + smp_mb__after_atomic(); } return false; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f9f19784db82..1d95a60b2586 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -205,7 +205,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr, } static void -nfs_async_read_error(struct list_head *head) +nfs_async_read_error(struct list_head *head, int error) { struct nfs_page *req; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0570391eaa16..c27ac96a95bd 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1919,7 +1919,7 @@ static int nfs_parse_devname(const char *dev_name, /* kill possible hostname list: not supported */ comma = strchr(dev_name, ','); if (comma != NULL && comma < end) - *comma = 0; + len = comma - 
dev_name; } if (len > maxnamlen) @@ -2041,7 +2041,8 @@ static int nfs23_validate_mount_data(void *options, memcpy(sap, &data->addr, sizeof(data->addr)); args->nfs_server.addrlen = sizeof(data->addr); args->nfs_server.port = ntohs(data->addr.sin_port); - if (!nfs_verify_server_address(sap)) + if (sap->sa_family != AF_INET || + !nfs_verify_server_address(sap)) goto out_no_address; if (!(data->flags & NFS_MOUNT_TCP)) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 79b97b3c4427..52d533967485 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -39,6 +39,7 @@ nfs_free_unlinkdata(struct nfs_unlinkdata *data) /** * nfs_async_unlink_done - Sillydelete post-processing * @task: rpc_task of the sillydelete + * @calldata: pointer to nfs_unlinkdata * * Do the directory attribute update. */ @@ -54,7 +55,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) /** * nfs_async_unlink_release - Release the sillydelete data. - * @task: rpc_task of the sillydelete + * @calldata: struct nfs_unlinkdata to release * * We need to call nfs_put_unlinkdata as a 'tk_release' task since the * rpc_task would be freed too. @@ -159,8 +160,8 @@ static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nf /** * nfs_async_unlink - asynchronous unlinking of a file - * @dir: parent directory of dentry - * @dentry: dentry to unlink + * @dentry: parent directory of dentry + * @name: name of dentry to unlink */ static int nfs_async_unlink(struct dentry *dentry, const struct qstr *name) @@ -324,6 +325,7 @@ static const struct rpc_call_ops nfs_rename_ops = { * @new_dir: target directory for the rename * @old_dentry: original dentry to be renamed * @new_dentry: dentry to which the old_dentry should be renamed + * @complete: Function to run on successful completion * * It's expected that valid references to the dentries and inodes are held */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d09c9f878141..f3ebabaa291d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -26,6 +26,7 @@ #include <linux/iversion.h> #include <linux/uaccess.h> +#include <linux/sched/mm.h> #include "delegation.h" #include "internal.h" @@ -712,11 +713,13 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct nfs_pageio_descriptor pgio; - struct nfs_io_completion *ioc = nfs_io_completion_alloc(GFP_NOFS); + struct nfs_io_completion *ioc; + unsigned int pflags = memalloc_nofs_save(); int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); + ioc = nfs_io_completion_alloc(GFP_NOFS); if (ioc) nfs_io_completion_init(ioc, nfs_io_completion_commit, inode); @@ -727,6 +730,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_pageio_complete(&pgio); nfs_io_completion_put(ioc); + memalloc_nofs_restore(pflags); + if (err < 0) goto out_err; err = pgio.pg_error; @@ -865,7 +870,6 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); /** * nfs_request_add_commit_list - add request to a commit list * @req: pointer to a struct nfs_page - * @dst: commit list head * @cinfo: holds list lock and accounting info * * This sets the PG_CLEAN bit, updates the cinfo count of @@ -1412,20 +1416,27 @@ static void nfs_redirty_request(struct nfs_page *req) nfs_release_request(req); } -static void nfs_async_write_error(struct list_head *head) +static void nfs_async_write_error(struct list_head *head, int error) { struct nfs_page *req; while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); + if 
(nfs_error_is_fatal(error)) { + nfs_context_set_write_error(req->wb_context, error); + if (nfs_error_is_fatal_on_server(error)) { + nfs_write_error_remove_page(req); + continue; + } + } nfs_redirty_request(req); } } static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr) { - nfs_async_write_error(&hdr->pages); + nfs_async_write_error(&hdr->pages, 0); filemap_fdatawrite_range(hdr->inode->i_mapping, hdr->args.offset, hdr->args.offset + hdr->args.count - 1); } diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9eb8086ea841..9bc32af4e2da 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -442,7 +442,9 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) struct nfsd3_readdirargs *argp = rqstp->rq_argp; struct nfsd3_readdirres *resp = rqstp->rq_resp; __be32 nfserr; - int count; + int count = 0; + struct page **p; + caddr_t page_addr = NULL; dprintk("nfsd: READDIR(3) %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -462,9 +464,31 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t*) &argp->cookie, &resp->common, nfs3svc_encode_entry); memcpy(resp->verf, argp->verf, 8); - resp->count = resp->buffer - argp->buffer; - if (resp->offset) - xdr_encode_hyper(resp->offset, argp->cookie); + count = 0; + for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { + page_addr = page_address(*p); + + if (((caddr_t)resp->buffer >= page_addr) && + ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { + count += (caddr_t)resp->buffer - page_addr; + break; + } + count += PAGE_SIZE; + } + resp->count = count >> 2; + if (resp->offset) { + loff_t offset = argp->cookie; + + if (unlikely(resp->offset1)) { + /* we ended up with offset on a page boundary */ + *resp->offset = htonl(offset >> 32); + *resp->offset1 = htonl(offset & 0xffffffff); + resp->offset1 = NULL; + } else { + xdr_encode_hyper(resp->offset, offset); + } + resp->offset = NULL; + } RETURN_STATUS(nfserr); } @@ -533,6 +557,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) } else { xdr_encode_hyper(resp->offset, offset); } + resp->offset = NULL; } RETURN_STATUS(nfserr); @@ -576,7 +601,7 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp) resp->f_wtmax = max_blocksize; resp->f_wtpref = max_blocksize; resp->f_wtmult = PAGE_SIZE; - resp->f_dtpref = PAGE_SIZE; + resp->f_dtpref = max_blocksize; resp->f_maxfilesize = ~(u32) 0; resp->f_properties = NFS3_FSF_DEFAULT; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9b973f4f7d01..8d789124ed3c 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -573,6 +573,9 @@ int nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readdirargs *args = rqstp->rq_argp; + int len; + u32 max_blocksize = svc_max_payload(rqstp); + p = decode_fh(p, &args->fh); if (!p) return 0; @@ -580,8 +583,14 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) args->verf = p; p += 2; args->dircount = ~0; args->count = ntohl(*p++); - args->count = min_t(u32, args->count, PAGE_SIZE); - args->buffer = page_address(*(rqstp->rq_next_page++)); + len = args->count = min_t(u32, args->count, max_blocksize); + + while (len > 0) { + struct page *p = *(rqstp->rq_next_page++); + if (!args->buffer) + args->buffer = page_address(p); + len -= PAGE_SIZE; + } return xdr_argsize_check(rqstp, p); } @@ -921,6 +930,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, } else { xdr_encode_hyper(cd->offset, offset64); } + cd->offset = NULL; } /* diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c74e4538d0eb..7caa3801ce72 100644 
--- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -60,16 +60,6 @@ struct nfs4_cb_compound_hdr { int status; }; -/* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("NFS: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - static __be32 *xdr_encode_empty_array(__be32 *p) { *p++ = xdr_zero; @@ -240,7 +230,6 @@ static int decode_cb_op_status(struct xdr_stream *xdr, *status = nfs_cb_stat_to_errno(be32_to_cpup(p)); return 0; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; out_unexpected: dprintk("NFSD: Callback server returned operation %d but " @@ -309,7 +298,6 @@ static int decode_cb_compound4res(struct xdr_stream *xdr, hdr->nops = be32_to_cpup(p); return 0; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -437,7 +425,6 @@ out: cb->cb_seq_status = status; return status; out_overflow: - print_overflow_msg(__func__, xdr); status = -EIO; goto out; } @@ -913,9 +900,9 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c return PTR_ERR(client); } cred = get_backchannel_cred(clp, client, ses); - if (IS_ERR(cred)) { + if (!cred) { rpc_shutdown_client(client); - return PTR_ERR(cred); + return -ENOMEM; } clp->cl_cb_client = client; clp->cl_cb_cred = cred; @@ -1023,8 +1010,9 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) cb->cb_seq_status = 1; cb->cb_status = 0; if (minorversion) { - if (!nfsd41_cb_get_slot(clp, task)) + if (!cb->cb_holds_slot && !nfsd41_cb_get_slot(clp, task)) return; + cb->cb_holds_slot = true; } rpc_call_start(task); } @@ -1051,6 +1039,9 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback return true; } + if (!cb->cb_holds_slot) + goto need_restart; + switch (cb->cb_seq_status) { case 0: /* @@ -1089,6 +1080,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback cb->cb_seq_status); } + cb->cb_holds_slot = false; clear_bit(0, &clp->cl_cb_slot_busy); rpc_wake_up_next(&clp->cl_cb_waitq); dprintk("%s: freed slot, new seqid=%d\n", __func__, @@ -1296,6 +1288,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_seq_status = 1; cb->cb_status = 0; cb->cb_need_restart = false; + cb->cb_holds_slot = false; } void nfsd4_run_cb(struct nfsd4_callback *cb) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 5188f9f70c78..8c8563441208 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -126,7 +126,6 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) SHASH_DESC_ON_STACK(desc, tfm); desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; status = crypto_shash_digest(desc, clname->data, clname->len, cksum.data); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fb3c9844c82a..f056b1d3fecd 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -265,6 +265,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, static void free_blocked_lock(struct nfsd4_blocked_lock *nbl) { + locks_delete_block(&nbl->nbl_lock); locks_release_private(&nbl->nbl_lock); kfree(nbl); } @@ -293,11 +294,18 @@ remove_blocked_locks(struct nfs4_lockowner *lo) nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); - locks_delete_block(&nbl->nbl_lock); free_blocked_lock(nbl); } } +static void +nfsd4_cb_notify_lock_prepare(struct 
nfsd4_callback *cb) +{ + struct nfsd4_blocked_lock *nbl = container_of(cb, + struct nfsd4_blocked_lock, nbl_cb); + locks_delete_block(&nbl->nbl_lock); +} + static int nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) { @@ -325,6 +333,7 @@ nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb) } static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { + .prepare = nfsd4_cb_notify_lock_prepare, .done = nfsd4_cb_notify_lock_done, .release = nfsd4_cb_notify_lock_release, }; @@ -1544,16 +1553,16 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) { u32 slotsize = slot_bytes(ca); u32 num = ca->maxreqs; - int avail; + unsigned long avail, total_avail; spin_lock(&nfsd_drc_lock); - avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, - nfsd_drc_max_mem - nfsd_drc_mem_used); + total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; + avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail); /* * Never use more than a third of the remaining memory, * unless it's the only way to give this client a slot: */ - avail = clamp_t(int, avail, slotsize, avail/3); + avail = clamp_t(int, avail, slotsize, total_avail/3); num = min_t(int, num, avail / slotsize); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); @@ -4863,7 +4872,6 @@ nfs4_laundromat(struct nfsd_net *nn) nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); - locks_delete_block(&nbl->nbl_lock); free_blocked_lock(nbl); } out: diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 72a7681f4046..f2feb2d11bae 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1126,7 +1126,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) case 'Y': case 'y': case '1': - if (nn->nfsd_serv) + if (!nn->nfsd_serv) return -EBUSY; nfsd4_end_grace(nn); break; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 396c76755b03..9d6cb246c6c5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -70,6 +70,7 @@ struct nfsd4_callback { int cb_seq_status; int cb_status; bool cb_need_restart; + bool cb_holds_slot; }; struct nfsd4_callback_ops { diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index f2129a5d9f23..4391fd3abd8f 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -189,7 +189,7 @@ retry: */ if (!err) return 0; - else if (err != -EEXIST) + else if (err != -EBUSY) goto failed_unlock; err = invalidate_inode_pages2_range(btnc, newkey, newkey); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 6b9c27548997..63c6bb1f8c4d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -346,10 +346,16 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) __kernel_fsid_t fsid = {}; fsnotify_foreach_obj_type(type) { + struct fsnotify_mark_connector *conn; + if (!fsnotify_iter_should_report_type(iter_info, type)) continue; - fsid = iter_info->marks[type]->connector->fsid; + conn = READ_ONCE(iter_info->marks[type]->connector); + /* Mark is just getting destroyed or created? */ + if (!conn) + continue; + fsid = conn->fsid; if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) continue; return fsid; @@ -408,8 +414,12 @@ static int fanotify_handle_event(struct fsnotify_group *group, return 0; } - if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { fsid = fanotify_get_fsid(iter_info); + /* Racing with mark destruction or creation? 
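 * If so, fanotify_get_fsid() saw only NULL connectors and returned an
 * all-zero fsid; a fid event without a valid fsid would be useless, so
 * drop the event instead.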
*/ + if (!fsid.val[0] && !fsid.val[1]) + return 0; + } event = fanotify_alloc_event(group, inode, mask, data, data_type, &fsid); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 56992b32c6bb..a90bb19dcfa2 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -208,6 +208,7 @@ static int copy_fid_to_user(struct fanotify_event *event, char __user *buf) { struct fanotify_event_info_fid info = { }; struct file_handle handle = { }; + unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh; size_t fh_len = event->fh_len; size_t len = fanotify_event_info_len(event); @@ -233,7 +234,16 @@ static int copy_fid_to_user(struct fanotify_event *event, char __user *buf) buf += sizeof(handle); len -= sizeof(handle); - if (copy_to_user(buf, fanotify_event_fh(event), fh_len)) + /* + * For an inline fh, copy through stack to exclude the copy from + * usercopy hardening protections. + */ + fh = fanotify_event_fh(event); + if (fh_len <= FANOTIFY_INLINE_FH_LEN) { + memcpy(bounce, fh, fh_len); + fh = bounce; + } + if (copy_to_user(buf, fh, fh_len)) return -EFAULT; /* Pad with 0's */ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index e2901fbb9f76..7b53598c8804 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -519,8 +519,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) return -ENOENT; - else if (create) - return -EEXIST; + else if (create) { + ret = -EEXIST; + goto out; + } i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); @@ -548,6 +550,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, /* return the wd */ ret = i_mark->wd; +out: /* match the get from fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d593d4269561..22acb0a79b53 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -239,13 +239,13 @@ static void fsnotify_drop_object(unsigned int type, void *objp) void fsnotify_put_mark(struct fsnotify_mark *mark) { - struct fsnotify_mark_connector *conn; + struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector); void *objp = NULL; unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; /* Catch marks that were actually never attached to object */ - if (!mark->connector) { + if (!conn) { if (refcount_dec_and_test(&mark->refcnt)) fsnotify_final_mark_destroy(mark); return; @@ -255,10 +255,9 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) * We have to be careful so that traversals of obj_list under lock can * safely grab mark reference. */ - if (!refcount_dec_and_lock(&mark->refcnt, &mark->connector->lock)) + if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock)) return; - conn = mark->connector; hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { objp = fsnotify_detach_connector_from_object(conn, &type); @@ -266,7 +265,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) } else { __fsnotify_recalc_mask(conn); } - mark->connector = NULL; + WRITE_ONCE(mark->connector, NULL); spin_unlock(&conn->lock); fsnotify_drop_object(type, objp); @@ -620,7 +619,7 @@ restart: /* mark should be the last entry. 
last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: - mark->connector = conn; + WRITE_ONCE(mark->connector, conn); out_err: spin_unlock(&conn->lock); spin_unlock(&mark->lock); @@ -808,6 +807,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, refcount_set(&mark->refcnt, 1); fsnotify_get_group(group); mark->group = group; + WRITE_ONCE(mark->connector, NULL); } /* diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index a35259eebc56..1dc9a08e8bdc 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4719,22 +4719,23 @@ out: /* Lock an inode and grab a bh pointing to the inode. */ int ocfs2_reflink_inodes_lock(struct inode *s_inode, - struct buffer_head **bh1, + struct buffer_head **bh_s, struct inode *t_inode, - struct buffer_head **bh2) + struct buffer_head **bh_t) { - struct inode *inode1; - struct inode *inode2; + struct inode *inode1 = s_inode; + struct inode *inode2 = t_inode; struct ocfs2_inode_info *oi1; struct ocfs2_inode_info *oi2; + struct buffer_head *bh1 = NULL; + struct buffer_head *bh2 = NULL; bool same_inode = (s_inode == t_inode); + bool need_swap = (inode1->i_ino > inode2->i_ino); int status; /* First grab the VFS and rw locks. */ lock_two_nondirectories(s_inode, t_inode); - inode1 = s_inode; - inode2 = t_inode; - if (inode1->i_ino > inode2->i_ino) + if (need_swap) swap(inode1, inode2); status = ocfs2_rw_lock(inode1, 1); @@ -4757,17 +4758,13 @@ int ocfs2_reflink_inodes_lock(struct inode *s_inode, trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, (unsigned long long)oi2->ip_blkno); - if (*bh1) - *bh1 = NULL; - if (*bh2) - *bh2 = NULL; - /* We always want to lock the one with the lower lockid first. */ if (oi1->ip_blkno > oi2->ip_blkno) mlog_errno(-ENOLCK); /* lock id1 */ - status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET); + status = ocfs2_inode_lock_nested(inode1, &bh1, 1, + OI_LS_REFLINK_TARGET); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -4776,15 +4773,25 @@ int ocfs2_reflink_inodes_lock(struct inode *s_inode, /* lock id2 */ if (!same_inode) { - status = ocfs2_inode_lock_nested(inode2, bh2, 1, + status = ocfs2_inode_lock_nested(inode2, &bh2, 1, OI_LS_REFLINK_TARGET); if (status < 0) { if (status != -ENOENT) mlog_errno(status); goto out_cl1; } - } else - *bh2 = *bh1; + } else { + bh2 = bh1; + } + + /* + * If we swapped inode order above, we have to swap the buffer heads + * before passing them back to the caller. + */ + if (need_swap) + swap(bh1, bh2); + *bh_s = bh1; + *bh_t = bh2; trace_ocfs2_double_lock_end( (unsigned long long)oi1->ip_blkno, @@ -4794,8 +4801,7 @@ int ocfs2_reflink_inodes_lock(struct inode *s_inode, out_cl1: ocfs2_inode_unlock(inode1, 1); - brelse(*bh1); - *bh1 = NULL; + brelse(bh1); out_rw2: ocfs2_rw_unlock(inode2, 1); out_i2: diff --git a/fs/open.c b/fs/open.c index 0285ce7dbd51..a00350018a47 100644 --- a/fs/open.c +++ b/fs/open.c @@ -733,6 +733,12 @@ static int do_dentry_open(struct file *f, return 0; } + /* Any file opened for execve()/uselib() has to be a regular file. */ + if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) { + error = -EACCES; + goto cleanup_file; + } + if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = get_write_access(inode); if (unlikely(error)) @@ -1209,3 +1215,21 @@ int nonseekable_open(struct inode *inode, struct file *filp) } EXPORT_SYMBOL(nonseekable_open); + +/* + * stream_open is used by subsystems that want stream-like file descriptors. 
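+ * A minimal illustration (not part of this change; foo_read and
+ * foo_write are hypothetical driver callbacks) is to point .open at it
+ * directly:
+ *
+ *	static const struct file_operations foo_fops = {
+ *		.owner = THIS_MODULE,
+ *		.open  = stream_open,
+ *		.read  = foo_read,
+ *		.write = foo_write,
+ *	};
+ *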
+ * Such file descriptors are not seekable and don't have notion of position + * (file.f_pos is always 0). Contrary to file descriptors of other regular + * files, .read() and .write() can run simultaneously. + * + * stream_open never fails and is marked to return int so that it could be + * directly used as file_operations.open . + */ +int stream_open(struct inode *inode, struct file *filp) +{ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); + filp->f_mode |= FMODE_STREAM; + return 0; +} + +EXPORT_SYMBOL(stream_open); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index f038235c64bd..c3334eca18c7 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -261,11 +261,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, generic_fillattr(inode, stat); /* override block size reported to stat */ - if (request_mask & STATX_SIZE) - stat->result_mask = STATX_BASIC_STATS; - else - stat->result_mask = STATX_BASIC_STATS & - ~STATX_SIZE; + if (!(request_mask & STATX_SIZE)) + stat->result_mask &= ~STATX_SIZE; stat->attributes_mask = STATX_ATTR_IMMUTABLE | STATX_ATTR_APPEND; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 9e62dcf06fc4..68b3303e4b46 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -443,6 +443,24 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) { int err; + /* + * Copy up data first and then xattrs. Writing data after + * xattrs will remove security.capability xattr automatically. + */ + if (S_ISREG(c->stat.mode) && !c->metacopy) { + struct path upperpath, datapath; + + ovl_path_upper(c->dentry, &upperpath); + if (WARN_ON(upperpath.dentry != NULL)) + return -EIO; + upperpath.dentry = temp; + + ovl_path_lowerdata(c->dentry, &datapath); + err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); + if (err) + return err; + } + err = ovl_copy_xattr(c->lowerpath.dentry, temp); if (err) return err; @@ -460,19 +478,6 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) return err; } - if (S_ISREG(c->stat.mode) && !c->metacopy) { - struct path upperpath, datapath; - - ovl_path_upper(c->dentry, &upperpath); - BUG_ON(upperpath.dentry != NULL); - upperpath.dentry = temp; - - ovl_path_lowerdata(c->dentry, &datapath); - err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); - if (err) - return err; - } - if (c->metacopy) { err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY, NULL, 0, -EOPNOTSUPP); @@ -737,6 +742,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) { struct path upperpath, datapath; int err; + char *capability = NULL; + ssize_t uninitialized_var(cap_size); ovl_path_upper(c->dentry, &upperpath); if (WARN_ON(upperpath.dentry == NULL)) @@ -746,15 +753,37 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) if (WARN_ON(datapath.dentry == NULL)) return -EIO; + if (c->stat.size) { + err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS, + &capability, 0); + if (err < 0 && err != -ENODATA) + goto out; + } + err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); if (err) - return err; + goto out_free; + + /* + * Writing to upper file will clear security.capability xattr. We + * don't want that to happen for normal copy-up operation. 
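+ * That is why the capability xattr was read before the data copy
+ * above: if it was present, it is written straight back below, so a
+ * metacopy data fill never strips a file's capabilities.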
+ */ + if (capability) { + err = ovl_do_setxattr(upperpath.dentry, XATTR_NAME_CAPS, + capability, cap_size, 0); + if (err) + goto out_free; + } + err = vfs_removexattr(upperpath.dentry, OVL_XATTR_METACOPY); if (err) - return err; + goto out_free; ovl_set_upperdata(d_inode(c->dentry)); +out_free: + kfree(capability); +out: return err; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 5e45cb3630a0..9c6018287d57 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -277,6 +277,8 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); int ovl_check_metacopy_xattr(struct dentry *dentry); bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct dentry *dentry, int padding); +ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, + size_t padding); static inline bool ovl_is_impuredir(struct dentry *dentry) { diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 7c01327b1852..4035e640f402 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -863,28 +863,49 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry) return (oe->numlower > 1); } -char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) +ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, + size_t padding) { - int res; - char *s, *next, *buf = NULL; + ssize_t res; + char *buf = NULL; - res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0); + res = vfs_getxattr(dentry, name, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) - return NULL; + return -ENODATA; goto fail; } - buf = kzalloc(res + padding + 1, GFP_KERNEL); - if (!buf) - return ERR_PTR(-ENOMEM); + if (res != 0) { + buf = kzalloc(res + padding, GFP_KERNEL); + if (!buf) + return -ENOMEM; - if (res == 0) - goto invalid; + res = vfs_getxattr(dentry, name, buf, res); + if (res < 0) + goto fail; + } + *value = buf; + + return res; + +fail: + pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n", + name, res); + kfree(buf); + return res; +} - res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res); +char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) +{ + int res; + char *s, *next, *buf = NULL; + + res = ovl_getxattr(dentry, OVL_XATTR_REDIRECT, &buf, padding + 1); + if (res == -ENODATA) + return NULL; if (res < 0) - goto fail; + return ERR_PTR(res); if (res == 0) goto invalid; @@ -900,15 +921,9 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) } return buf; - -err_free: - kfree(buf); - return ERR_PTR(res); -fail: - pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res); - goto err_free; invalid: pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf); res = -EINVAL; - goto err_free; + kfree(buf); + return ERR_PTR(res); } diff --git a/fs/pipe.c b/fs/pipe.c index 51d5fd8840ab..41065901106b 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal); * in the tee() system call, when we duplicate the buffers in one * pipe into another. */ -void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - get_page(buf->page); + return try_get_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_get); @@ -225,8 +225,15 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe, } EXPORT_SYMBOL(generic_pipe_buf_release); +/* New data written to a pipe may be appended to a buffer with this type. 
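 * The old ->can_merge flag is gone: whether new data may be appended is
 * now encoded entirely in which ops table a buffer carries, and
 * pipe_buf_mark_unmergeable() below demotes a buffer to the _nomerge
 * variant once its page becomes shared (e.g. after tee()).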
*/ static const struct pipe_buf_operations anon_pipe_buf_ops = { - .can_merge = 1, + .confirm = generic_pipe_buf_confirm, + .release = anon_pipe_buf_release, + .steal = anon_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = { .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, @@ -234,13 +241,32 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { }; static const struct pipe_buf_operations packet_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, .get = generic_pipe_buf_get, }; +/** + * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable + * @buf: the buffer to mark + * + * Description: + * This function ensures that no future writes will be merged into the + * given &struct pipe_buffer. This is necessary when multiple pipe buffers + * share the same backing page. + */ +void pipe_buf_mark_unmergeable(struct pipe_buffer *buf) +{ + if (buf->ops == &anon_pipe_buf_ops) + buf->ops = &anon_pipe_buf_nomerge_ops; +} + +static bool pipe_buf_can_merge(struct pipe_buffer *buf) +{ + return buf->ops == &anon_pipe_buf_ops; +} + static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { @@ -378,7 +404,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) struct pipe_buffer *buf = pipe->bufs + lastbuf; int offset = buf->offset + buf->len; - if (buf->ops->can_merge && offset + chars <= PAGE_SIZE) { + if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) { ret = pipe_buf_confirm(pipe, buf); if (ret) goto out; diff --git a/fs/pnode.c b/fs/pnode.c index 1100e810d855..7ea6cfb65077 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -214,7 +214,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin) } /* all accesses are serialized by namespace_sem */ -static struct user_namespace *user_ns; static struct mount *last_dest, *first_source, *last_source, *dest_master; static struct mountpoint *mp; static struct hlist_head *list; @@ -260,9 +259,6 @@ static int propagate_one(struct mount *m) type |= CL_MAKE_SHARED; } - /* Notice when we are propagating across user namespaces */ - if (m->mnt_ns->user_ns != user_ns) - type |= CL_UNPRIVILEGED; child = copy_tree(last_source, last_source->mnt.mnt_root, type); if (IS_ERR(child)) return PTR_ERR(child); @@ -303,7 +299,6 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, * propagate_one(); everything is serialized by namespace_sem, * so globals will do just fine. 
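 * (One former global, user_ns, is gone entirely: with CL_UNPRIVILEGED
 * removed there is no flag-stripping to do when propagation crosses a
 * user namespace, so nothing namespace-related needs to be cached.)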
*/ - user_ns = current->nsproxy->mnt_ns->user_ns; last_dest = dest_mnt; first_source = source_mnt; last_source = source_mnt; diff --git a/fs/pnode.h b/fs/pnode.h index dc87e65becd2..3960a83666cf 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -27,8 +27,7 @@ #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 -#define CL_UNPRIVILEGED 0x40 -#define CL_COPY_MNT_NS_FILE 0x80 +#define CL_COPY_MNT_NS_FILE 0x40 #define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE) diff --git a/fs/proc/base.c b/fs/proc/base.c index 5ab1849971b4..f179568b4c76 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -59,6 +59,7 @@ #include <linux/capability.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/generic-radix-tree.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/namei.h> @@ -92,7 +93,6 @@ #include <linux/sched/coredump.h> #include <linux/sched/debug.h> #include <linux/sched/stat.h> -#include <linux/flex_array.h> #include <linux/posix-timers.h> #include <trace/events/oom.h> #include "internal.h" @@ -407,7 +407,6 @@ static void unlock_trace(struct task_struct *task) static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - struct stack_trace trace; unsigned long *entries; int err; @@ -430,20 +429,17 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, if (!entries) return -ENOMEM; - trace.nr_entries = 0; - trace.max_entries = MAX_STACK_TRACE_DEPTH; - trace.entries = entries; - trace.skip = 0; - err = lock_trace(task); if (!err) { - unsigned int i; + unsigned int i, nr_entries; - save_stack_trace_tsk(task, &trace); + nr_entries = stack_trace_save_tsk(task, entries, + MAX_STACK_TRACE_DEPTH, 0); - for (i = 0; i < trace.nr_entries; i++) { + for (i = 0; i < nr_entries; i++) { seq_printf(m, "[<0>] %pB\n", (void *)entries[i]); } + unlock_trace(task); } kfree(entries); @@ -489,10 +485,9 @@ static int lstats_show_proc(struct seq_file *m, void *v) lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long bt = lr->backtrace[q]; + if (!bt) break; - if (bt == ULONG_MAX) - break; seq_printf(m, " %ps", (void *)bt); } seq_putc(m, '\n'); @@ -616,24 +611,25 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - long nr; - unsigned long args[6], sp, pc; + struct syscall_info info; + u64 *args = &info.data.args[0]; int res; res = lock_trace(task); if (res) return res; - if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) + if (task_current_syscall(task, &info)) seq_puts(m, "running\n"); - else if (nr < 0) - seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + else if (info.data.nr < 0) + seq_printf(m, "%d 0x%llx 0x%llx\n", + info.data.nr, info.sp, info.data.instruction_pointer); else seq_printf(m, - "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", - nr, + "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n", + info.data.nr, args[0], args[1], args[2], args[3], args[4], args[5], - sp, pc); + info.sp, info.data.instruction_pointer); unlock_trace(task); return 0; @@ -2142,11 +2138,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) struct task_struct *task; struct mm_struct *mm; unsigned long nr_files, pos, i; - struct flex_array *fa = NULL; - struct map_files_info info; + GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; + genradix_init(&fa); + 
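/*
 * 'fa' is a generic radix tree of map_files_info entries replacing the
 * old flex_array; a rough sketch of the API used in this function
 * (from linux/generic-radix-tree.h):
 *
 *	GENRADIX(struct map_files_info) fa;
 *	genradix_init(&fa);
 *	p = genradix_ptr_alloc(&fa, idx, GFP_KERNEL);	(allocate slot idx)
 *	p = genradix_ptr(&fa, idx);			(lookup only)
 *	genradix_free(&fa);
 *
 * Slots are allocated on demand, so the VMA list no longer has to be
 * walked twice to count the entries first.
 */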
ret = -ENOENT; task = get_proc_task(file_inode(file)); if (!task) @@ -2178,35 +2175,22 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) */ for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { - if (vma->vm_file && ++pos > ctx->pos) - nr_files++; - } + if (!vma->vm_file) + continue; + if (++pos <= ctx->pos) + continue; - if (nr_files) { - fa = flex_array_alloc(sizeof(info), nr_files, - GFP_KERNEL); - if (!fa || flex_array_prealloc(fa, 0, nr_files, - GFP_KERNEL)) { + p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL); + if (!p) { ret = -ENOMEM; - if (fa) - flex_array_free(fa); up_read(&mm->mmap_sem); mmput(mm); goto out_put_task; } - for (i = 0, vma = mm->mmap, pos = 2; vma; - vma = vma->vm_next) { - if (!vma->vm_file) - continue; - if (++pos <= ctx->pos) - continue; - info.start = vma->vm_start; - info.end = vma->vm_end; - info.mode = vma->vm_file->f_mode; - if (flex_array_put(fa, i++, &info, GFP_KERNEL)) - BUG(); - } + p->start = vma->vm_start; + p->end = vma->vm_end; + p->mode = vma->vm_file->f_mode; } up_read(&mm->mmap_sem); mmput(mm); @@ -2215,7 +2199,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ unsigned int len; - p = flex_array_get(fa, i); + p = genradix_ptr(&fa, i); len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); if (!proc_fill_cache(file, ctx, buf, len, @@ -2225,12 +2209,11 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) break; ctx->pos++; } - if (fa) - flex_array_free(fa); out_put_task: put_task_struct(task); out: + genradix_free(&fa); return ret; } @@ -2459,11 +2442,10 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry, static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, - const struct pid_entry *ents, - unsigned int nents) + const struct pid_entry *p, + const struct pid_entry *end) { struct task_struct *task = get_proc_task(dir); - const struct pid_entry *p, *last; struct dentry *res = ERR_PTR(-ENOENT); if (!task) @@ -2473,8 +2455,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir, * Yes, it does not scale. And it should not. Don't add * new entries into /proc/<tgid>/ without very good reasons. 
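 * (The helper now takes a [p, end) pointer pair rather than a base
 * array plus an entry count, which is why the wrappers below pass
 * stuff + ARRAY_SIZE(stuff) as the end sentinel.)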
*/ - last = &ents[nents]; - for (p = ents; p < last; p++) { + for (; p < end; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) { @@ -2610,7 +2591,7 @@ static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \ { \ return proc_pident_lookup(dir, dentry, \ LSM##_attr_dir_stuff, \ - ARRAY_SIZE(LSM##_attr_dir_stuff)); \ + LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ } \ \ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \ @@ -2655,7 +2636,8 @@ static struct dentry *proc_attr_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, - attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); + attr_dir_stuff, + attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff)); } static const struct inode_operations proc_attr_dir_inode_operations = { @@ -3088,10 +3070,20 @@ static const struct file_operations proc_tgid_base_operations = { .llseek = generic_file_llseek, }; +struct pid *tgid_pidfd_to_pid(const struct file *file) +{ + if (!d_is_dir(file->f_path.dentry) || + (file->f_op != &proc_tgid_base_operations)) + return ERR_PTR(-EBADF); + + return proc_pid(file_inode(file)); +} + static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, - tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); + tgid_base_stuff, + tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff)); } static const struct inode_operations proc_tgid_base_inode_operations = { @@ -3463,7 +3455,8 @@ static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, - tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); + tid_base_stuff, + tid_base_stuff + ARRAY_SIZE(tid_base_stuff)); } static const struct file_operations proc_tid_base_operations = { diff --git a/fs/proc/inode.c b/fs/proc/inode.c index da649ccd6804..fc7e38def174 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -24,7 +24,6 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/mount.h> -#include <linux/magic.h> #include <linux/uaccess.h> @@ -122,13 +121,12 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) return 0; } -static const struct super_operations proc_sops = { +const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .drop_inode = generic_delete_inode, .evict_inode = proc_evict_inode, .statfs = simple_statfs, - .remount_fs = proc_remount, .show_options = proc_show_options, }; @@ -488,51 +486,3 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) pde_put(de); return inode; } - -int proc_fill_super(struct super_block *s, void *data, int silent) -{ - struct pid_namespace *ns = get_pid_ns(s->s_fs_info); - struct inode *root_inode; - int ret; - - if (!proc_parse_options(data, ns)) - return -EINVAL; - - /* User space would break if executables or devices appear on proc */ - s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; - s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; - s->s_blocksize = 1024; - s->s_blocksize_bits = 10; - s->s_magic = PROC_SUPER_MAGIC; - s->s_op = &proc_sops; - s->s_time_gran = 1; - - /* - * procfs isn't actually a stacking filesystem; however, there is - * too much magic going on inside it to permit stacking things on - * top of it - */ - 
s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; - - /* procfs dentries and inodes don't require IO to create */ - s->s_shrink.seeks = 0; - - pde_get(&proc_root); - root_inode = proc_get_inode(s, &proc_root); - if (!root_inode) { - pr_err("proc_fill_super: get root inode failed\n"); - return -ENOMEM; - } - - s->s_root = d_make_root(root_inode); - if (!s->s_root) { - pr_err("proc_fill_super: allocate dentry failed\n"); - return -ENOMEM; - } - - ret = proc_setup_self(s); - if (ret) { - return ret; - } - return proc_setup_thread_self(s); -} diff --git a/fs/proc/internal.h b/fs/proc/internal.h index ea575375f210..d1671e97f7fe 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -207,13 +207,12 @@ struct pde_opener { struct completion *c; } __randomize_layout; extern const struct inode_operations proc_link_inode_operations; - extern const struct inode_operations proc_pid_link_inode_operations; +extern const struct super_operations proc_sops; void proc_init_kmemcache(void); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); -extern int proc_fill_super(struct super_block *, void *data, int flags); extern void proc_entry_rundown(struct proc_dir_entry *); /* @@ -271,10 +270,8 @@ static inline void proc_tty_init(void) {} * root.c */ extern struct proc_dir_entry proc_root; -extern int proc_parse_options(char *options, struct pid_namespace *pid); extern void proc_self_init(void); -extern int proc_remount(struct super_block *, int *, char *); /* * task_[no]mmu.c diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index bbcc185062bb..f5834488b67d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -54,6 +54,28 @@ static LIST_HEAD(kclist_head); static DECLARE_RWSEM(kclist_lock); static int kcore_need_update = 1; +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * Same as oldmem_pfn_is_ram in vmcore + */ +static int (*mem_pfn_is_ram)(unsigned long pfn); + +int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ + if (mem_pfn_is_ram) + return -EBUSY; + mem_pfn_is_ram = fn; + return 0; +} + +static int pfn_is_ram(unsigned long pfn) +{ + if (mem_pfn_is_ram) + return mem_pfn_is_ram(pfn); + else + return 1; +} + /* This doesn't grab kclist_lock, so it should only be used at init time. */ void __init kclist_add(struct kcore_list *new, void *addr, size_t size, int type) @@ -465,6 +487,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) goto out; } m = NULL; /* skip the list anchor */ + } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } } else if (m->type == KCORE_VMALLOC) { vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ @@ -588,7 +615,7 @@ static void __init proc_kcore_text_init(void) /* * MODULES_VADDR has no intersection with VMALLOC_ADDR. 
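 * (kcore_modules is referenced only in this file, hence the static
 * qualifier gained below.)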
*/ -struct kcore_list kcore_modules; +static struct kcore_list kcore_modules; static void __init add_modules_range(void) { if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) { diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 4d598a399bbf..7325baa8f9d4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1626,8 +1626,11 @@ static void drop_sysctl_table(struct ctl_table_header *header) if (--header->nreg) return; - put_links(header); - start_unregistering(header); + if (parent) { + put_links(header); + start_unregistering(header); + } + if (!--header->count) kfree_rcu(header, rcu); diff --git a/fs/proc/root.c b/fs/proc/root.c index 621e6ec322ca..8b145e7b9661 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -19,86 +19,178 @@ #include <linux/module.h> #include <linux/bitops.h> #include <linux/user_namespace.h> +#include <linux/fs_context.h> #include <linux/mount.h> #include <linux/pid_namespace.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/cred.h> +#include <linux/magic.h> +#include <linux/slab.h> #include "internal.h" -enum { - Opt_gid, Opt_hidepid, Opt_err, +struct proc_fs_context { + struct pid_namespace *pid_ns; + unsigned int mask; + int hidepid; + int gid; }; -static const match_table_t tokens = { - {Opt_hidepid, "hidepid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_err, NULL}, +enum proc_param { + Opt_gid, + Opt_hidepid, }; -int proc_parse_options(char *options, struct pid_namespace *pid) +static const struct fs_parameter_spec proc_param_specs[] = { + fsparam_u32("gid", Opt_gid), + fsparam_u32("hidepid", Opt_hidepid), + {} +}; + +static const struct fs_parameter_description proc_fs_parameters = { + .name = "proc", + .specs = proc_param_specs, +}; + +static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - args[0].to = args[0].from = NULL; - token = match_token(p, tokens, args); - switch (token) { - case Opt_gid: - if (match_int(&args[0], &option)) - return 0; - pid->pid_gid = make_kgid(current_user_ns(), option); - break; - case Opt_hidepid: - if (match_int(&args[0], &option)) - return 0; - if (option < HIDEPID_OFF || - option > HIDEPID_INVISIBLE) { - pr_err("proc: hidepid value must be between 0 and 2.\n"); - return 0; - } - pid->hide_pid = option; - break; - default: - pr_err("proc: unrecognized mount option \"%s\" " - "or missing value\n", p); - return 0; - } + struct proc_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, &proc_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_gid: + ctx->gid = result.uint_32; + break; + + case Opt_hidepid: + ctx->hidepid = result.uint_32; + if (ctx->hidepid < HIDEPID_OFF || + ctx->hidepid > HIDEPID_INVISIBLE) + return invalf(fc, "proc: hidepid value must be between 0 and 2.\n"); + break; + + default: + return -EINVAL; } - return 1; + ctx->mask |= 1 << opt; + return 0; } -int proc_remount(struct super_block *sb, int *flags, char *data) +static void proc_apply_options(struct super_block *s, + struct fs_context *fc, + struct pid_namespace *pid_ns, + struct user_namespace *user_ns) { + struct proc_fs_context *ctx = fc->fs_private; + + if (ctx->mask & (1 << Opt_gid)) + pid_ns->pid_gid = make_kgid(user_ns, ctx->gid); + if (ctx->mask & (1 << Opt_hidepid)) + pid_ns->hide_pid = ctx->hidepid; +} + 
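/*
 * A rough sketch of how the pieces above and below fit together under
 * the new mount API (vfs_parse_fs_string() comes from the generic
 * fs_context core and is shown only to illustrate option parsing):
 *
 *	fc = fs_context_for_mount(&proc_fs_type, flags);
 *		-> proc_init_fs_context()  allocates a proc_fs_context
 *					   and pins the active pid_ns
 *	vfs_parse_fs_string(fc, "hidepid", "2", 1);
 *		-> proc_parse_param()	   records gid/hidepid in ctx
 *	fc_mount(fc);
 *		-> proc_get_tree() -> vfs_get_super(..., proc_fill_super)
 *	remount
 *		-> proc_reconfigure()
 *
 * Note that parsing only fills in the context; proc_apply_options()
 * above is what commits the values to the pid namespace, at fill and
 * reconfigure time.
 */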
+static int proc_fill_super(struct super_block *s, struct fs_context *fc) +{ + struct pid_namespace *pid_ns = get_pid_ns(s->s_fs_info); + struct inode *root_inode; + int ret; + + proc_apply_options(s, fc, pid_ns, current_user_ns()); + + /* User space would break if executables or devices appear on proc */ + s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; + s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = PROC_SUPER_MAGIC; + s->s_op = &proc_sops; + s->s_time_gran = 1; + + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + + /* procfs dentries and inodes don't require IO to create */ + s->s_shrink.seeks = 0; + + pde_get(&proc_root); + root_inode = proc_get_inode(s, &proc_root); + if (!root_inode) { + pr_err("proc_fill_super: get root inode failed\n"); + return -ENOMEM; + } + + s->s_root = d_make_root(root_inode); + if (!s->s_root) { + pr_err("proc_fill_super: allocate dentry failed\n"); + return -ENOMEM; + } + + ret = proc_setup_self(s); + if (ret) { + return ret; + } + return proc_setup_thread_self(s); +} + +static int proc_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; struct pid_namespace *pid = sb->s_fs_info; sync_filesystem(sb); - return !proc_parse_options(data, pid); + + proc_apply_options(sb, fc, pid, current_user_ns()); + return 0; } -static struct dentry *proc_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int proc_get_tree(struct fs_context *fc) { - struct pid_namespace *ns; + struct proc_fs_context *ctx = fc->fs_private; - if (flags & SB_KERNMOUNT) { - ns = data; - data = NULL; - } else { - ns = task_active_pid_ns(current); - } + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); + fc->s_fs_info = ctx->pid_ns; + return vfs_get_super(fc, vfs_get_keyed_super, proc_fill_super); +} - return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super); +static void proc_fs_context_free(struct fs_context *fc) +{ + struct proc_fs_context *ctx = fc->fs_private; + + if (ctx->pid_ns) + put_pid_ns(ctx->pid_ns); + kfree(ctx); +} + +static const struct fs_context_operations proc_fs_context_ops = { + .free = proc_fs_context_free, + .parse_param = proc_parse_param, + .get_tree = proc_get_tree, + .reconfigure = proc_reconfigure, +}; + +static int proc_init_fs_context(struct fs_context *fc) +{ + struct proc_fs_context *ctx; + + ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->pid_ns = get_pid_ns(task_active_pid_ns(current)); + fc->fs_private = ctx; + fc->ops = &proc_fs_context_ops; + return 0; } static void proc_kill_sb(struct super_block *sb) @@ -115,10 +207,11 @@ static void proc_kill_sb(struct super_block *sb) } static struct file_system_type proc_fs_type = { - .name = "proc", - .mount = proc_mount, - .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "proc", + .init_fs_context = proc_init_fs_context, + .parameters = &proc_fs_parameters, + .kill_sb = proc_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) @@ -156,7 +249,7 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr { if (!proc_pid_lookup(dentry, flags)) return NULL; - + return proc_lookup(dir, dentry, flags); } @@ -209,9 +302,28 @@ struct proc_dir_entry proc_root = { 
int pid_ns_prepare_proc(struct pid_namespace *ns) { + struct proc_fs_context *ctx; + struct fs_context *fc; struct vfsmount *mnt; - mnt = kern_mount_data(&proc_fs_type, ns); + fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + if (fc->user_ns != ns->user_ns) { + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ns->user_ns); + } + + ctx = fc->fs_private; + if (ctx->pid_ns != ns) { + put_pid_ns(ctx->pid_ns); + get_pid_ns(ns); + ctx->pid_ns = ns; + } + + mnt = fc_mount(fc); + put_fs_context(fc); if (IS_ERR(mnt)) return PTR_ERR(mnt); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index beccb0b1d57c..95ca1fe7283c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -59,7 +59,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); - SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm); + SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm)); SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); @@ -1143,6 +1143,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = -EINTR; goto out_mm; } + /* + * Avoid modifying vma->vm_flags + * without locked ops while the + * coredump reads the vm_flags. + */ + if (!mmget_still_valid(mm)) { + /* + * Silently return "count" + * as if get_task_mm() had + * failed. FIXME: should this + * function return -ESRCH + * when get_task_mm() fails, + * as it does when + * get_proc_task() fails? + */ + up_write(&mm->mmap_sem); + goto out_mm; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); diff --git a/fs/read_write.c b/fs/read_write.c index 30df848b7451..61b43ad7608e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -478,8 +478,8 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t return ret; } -ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, - loff_t *pos) +static ssize_t __vfs_write(struct file *file, const char __user *p, + size_t count, loff_t *pos) { if (file->f_op->write) return file->f_op->write(file, p, count, pos); @@ -560,12 +560,13 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ static inline loff_t file_pos_read(struct file *file) { - return file->f_pos; + return file->f_mode & FMODE_STREAM ?
0 : file->f_pos; } static inline void file_pos_write(struct file *file, loff_t pos) { - file->f_pos = pos; + if ((file->f_mode & FMODE_STREAM) == 0) + file->f_pos = pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) @@ -1238,6 +1239,9 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct compat_iovec __user *,vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { + if (pos == -1) + return do_compat_readv(fd, vec, vlen, flags); + return do_compat_preadv64(fd, vec, vlen, pos, flags); } #endif @@ -1344,6 +1348,9 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct compat_iovec __user *,vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { + if (pos == -1) + return do_compat_writev(fd, vec, vlen, flags); + return do_compat_pwritev64(fd, vec, vlen, pos, flags); } #endif diff --git a/fs/splice.c b/fs/splice.c index 6489fb9436e4..25212dcca2df 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -138,7 +138,6 @@ error: } const struct pipe_buf_operations page_cache_pipe_buf_ops = { - .can_merge = 0, .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, @@ -156,7 +155,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations user_page_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, @@ -326,22 +324,20 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, EXPORT_SYMBOL(generic_file_splice_read); const struct pipe_buf_operations default_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, }; -static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { return 1; } /* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_nosteal, @@ -1597,7 +1593,11 @@ retry: * Get a reference to this pipe buffer, * so we can copy the contents over. */ - pipe_buf_get(ipipe, ibuf); + if (!pipe_buf_get(ipipe, ibuf)) { + if (ret == 0) + ret = -EFAULT; + break; + } *obuf = *ibuf; /* @@ -1606,6 +1606,8 @@ retry: */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + pipe_buf_mark_unmergeable(obuf); + obuf->len = len; opipe->nrbufs++; ibuf->offset += obuf->len; @@ -1669,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, * Get a reference to this pipe buffer, * so we can copy the contents over. 
*/ - pipe_buf_get(ipipe, ibuf); + if (!pipe_buf_get(ipipe, ibuf)) { + if (ret == 0) + ret = -EFAULT; + break; + } obuf = opipe->bufs + nbuf; *obuf = *ibuf; @@ -1680,6 +1686,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + pipe_buf_mark_unmergeable(obuf); + if (obuf->len > len) obuf->len = len; diff --git a/fs/stat.c b/fs/stat.c index adbfcd86c81b..c38e4c2e1221 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -45,11 +45,6 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) stat->ctime = inode->i_ctime; stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; - - if (IS_NOATIME(inode)) - stat->result_mask &= ~STATX_ATIME; - if (IS_AUTOMOUNT(inode)) - stat->attributes |= STATX_ATTR_AUTOMOUNT; } EXPORT_SYMBOL(generic_fillattr); @@ -75,6 +70,13 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, stat->result_mask |= STATX_BASIC_STATS; request_mask &= STATX_ALL; query_flags &= KSTAT_QUERY_FLAGS; + + /* allow the fs to override these if it really wants to */ + if (IS_NOATIME(inode)) + stat->result_mask &= ~STATX_ATIME; + if (IS_AUTOMOUNT(inode)) + stat->attributes |= STATX_ATTR_AUTOMOUNT; + if (inode->i_op->getattr) return inode->i_op->getattr(path, stat, request_mask, query_flags); diff --git a/fs/super.c b/fs/super.c index 48e25eba8465..2739f57515f8 100644 --- a/fs/super.c +++ b/fs/super.c @@ -35,6 +35,7 @@ #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> +#include <linux/fs_context.h> #include <uapi/linux/mount.h> #include "internal.h" @@ -476,6 +477,94 @@ void generic_shutdown_super(struct super_block *sb) EXPORT_SYMBOL(generic_shutdown_super); /** + * sget_fc - Find or create a superblock + * @fc: Filesystem context. + * @test: Comparison callback + * @set: Setup callback + * + * Find or create a superblock using the parameters stored in the filesystem + * context and the two callback functions. + * + * If an extant superblock is matched, then that will be returned with an + * elevated reference count that the caller must transfer or discard. + * + * If no match is made, a new superblock will be allocated and basic + * initialisation will be performed (s_type, s_fs_info and s_id will be set and + * the set() callback will be invoked), the superblock will be published and it + * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE + * as yet unset. + */ +struct super_block *sget_fc(struct fs_context *fc, + int (*test)(struct super_block *, struct fs_context *), + int (*set)(struct super_block *, struct fs_context *)) +{ + struct super_block *s = NULL; + struct super_block *old; + struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; + int err; + + if (!(fc->sb_flags & SB_KERNMOUNT) && + fc->purpose != FS_CONTEXT_FOR_SUBMOUNT) { + /* Don't allow mounting unless the caller has CAP_SYS_ADMIN + * over the namespace. 
+ */ + if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + } else { + if (!ns_capable(fc->user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + } + } + +retry: + spin_lock(&sb_lock); + if (test) { + hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { + if (test(old, fc)) + goto share_extant_sb; + } + } + if (!s) { + spin_unlock(&sb_lock); + s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); + if (!s) + return ERR_PTR(-ENOMEM); + goto retry; + } + + s->s_fs_info = fc->s_fs_info; + err = set(s, fc); + if (err) { + s->s_fs_info = NULL; + spin_unlock(&sb_lock); + destroy_unused_super(s); + return ERR_PTR(err); + } + fc->s_fs_info = NULL; + s->s_type = fc->fs_type; + strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id)); + list_add_tail(&s->s_list, &super_blocks); + hlist_add_head(&s->s_instances, &s->s_type->fs_supers); + spin_unlock(&sb_lock); + get_filesystem(s->s_type); + register_shrinker_prepared(&s->s_shrink); + return s; + +share_extant_sb: + if (user_ns != old->s_user_ns) { + spin_unlock(&sb_lock); + destroy_unused_super(s); + return ERR_PTR(-EBUSY); + } + if (!grab_super(old)) + goto retry; + destroy_unused_super(s); + return old; +} +EXPORT_SYMBOL(sget_fc); + +/** * sget_userns - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback @@ -835,28 +924,35 @@ rescan: } /** - * do_remount_sb - asks filesystem to change mount options. - * @sb: superblock in question - * @sb_flags: revised superblock flags - * @data: the rest of options - * @force: whether or not to force the change + * reconfigure_super - asks filesystem to change superblock parameters + * @fc: The superblock and configuration * - * Alters the mount options of a mounted file system. + * Alters the configuration parameters of a live superblock. */ -int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) +int reconfigure_super(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; int retval; - int remount_ro; + bool remount_ro = false; + bool force = fc->sb_flags & SB_FORCE; + if (fc->sb_flags_mask & ~MS_RMT_MASK) + return -EINVAL; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; + retval = security_sb_remount(sb, fc->security); + if (retval) + return retval; + + if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK - if (!(sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev)) - return -EACCES; + if (!(fc->sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev)) + return -EACCES; #endif - remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb); + remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); + } if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { @@ -867,13 +963,14 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; - remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb); + remount_ro = !sb_rdonly(sb); } } shrink_dcache_sb(sb); - /* If we are remounting RDONLY and current sb is read/write, - make sure there are no rw files opened */ + /* If we are reconfiguring to RDONLY and current sb is read/write, + * make sure there are no files open for writing. 
+ */ if (remount_ro) { if (force) { sb->s_readonly_remount = 1; @@ -885,8 +982,8 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) } } - if (sb->s_op->remount_fs) { - retval = sb->s_op->remount_fs(sb, &sb_flags, data); + if (fc->ops->reconfigure) { + retval = fc->ops->reconfigure(fc); if (retval) { if (!force) goto cancel_readonly; @@ -895,7 +992,9 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) sb->s_type->name, retval); } } - sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (sb_flags & MS_RMT_MASK); + + WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | + (fc->sb_flags & fc->sb_flags_mask))); /* Needs to be ordered wrt mnt_is_readonly() */ smp_wmb(); sb->s_readonly_remount = 0; @@ -922,10 +1021,15 @@ static void do_emergency_remount_callback(struct super_block *sb) down_write(&sb->s_umount); if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && !sb_rdonly(sb)) { - /* - * What lock protects sb->s_flags?? - */ - do_remount_sb(sb, SB_RDONLY, NULL, 1); + struct fs_context *fc; + + fc = fs_context_for_reconfigure(sb->s_root, + SB_RDONLY | SB_FORCE, SB_RDONLY); + if (!IS_ERR(fc)) { + if (parse_monolithic_mount_data(fc, NULL) == 0) + (void)reconfigure_super(fc); + put_fs_context(fc); + } } up_write(&sb->s_umount); } @@ -1087,6 +1191,89 @@ struct dentry *mount_ns(struct file_system_type *fs_type, EXPORT_SYMBOL(mount_ns); +int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) +{ + return set_anon_super(sb, NULL); +} +EXPORT_SYMBOL(set_anon_super_fc); + +static int test_keyed_super(struct super_block *sb, struct fs_context *fc) +{ + return sb->s_fs_info == fc->s_fs_info; +} + +static int test_single_super(struct super_block *s, struct fs_context *fc) +{ + return 1; +} + +/** + * vfs_get_super - Get a superblock with a search key set in s_fs_info. + * @fc: The filesystem context holding the parameters + * @keying: How to distinguish superblocks + * @fill_super: Helper to initialise a new superblock + * + * Search for a superblock and create a new one if not found. The search + * criterion is controlled by @keying. If the search fails, a new superblock + * is created and @fill_super() is called to initialise it. + * + * @keying can take one of a number of values: + * + * (1) vfs_get_single_super - Only one superblock of this type may exist on the + * system. This is typically used for special system filesystems. + * + * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have + * distinct keys (where the key is in s_fs_info). Searching for the same + * key again will turn up the superblock for that key. + * + * (3) vfs_get_independent_super - Multiple superblocks may exist and are + * unkeyed. Each call will get a new superblock. + * + * A permissions check is made by sget_fc() unless we're getting a superblock + * for a kernel-internal mount or a submount. 
+ */ +int vfs_get_super(struct fs_context *fc, + enum vfs_get_super_keying keying, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc)) +{ + int (*test)(struct super_block *, struct fs_context *); + struct super_block *sb; + + switch (keying) { + case vfs_get_single_super: + test = test_single_super; + break; + case vfs_get_keyed_super: + test = test_keyed_super; + break; + case vfs_get_independent_super: + test = NULL; + break; + default: + BUG(); + } + + sb = sget_fc(fc, test, set_anon_super_fc); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + int err = fill_super(sb, fc); + if (err) { + deactivate_locked_super(sb); + return err; + } + + sb->s_flags |= SB_ACTIVE; + } + + BUG_ON(fc->root); + fc->root = dget(sb->s_root); + return 0; +} +EXPORT_SYMBOL(vfs_get_super); + #ifdef CONFIG_BLOCK static int set_bdev_super(struct super_block *s, void *data) { @@ -1212,6 +1399,31 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_nodev); +static int reconfigure_single(struct super_block *s, + int flags, void *data) +{ + struct fs_context *fc; + int ret; + + /* The caller really needs to be passing fc down into mount_single(), + * then a chunk of this can be removed. [Bollocks -- AV] + * Better yet, reconfiguration shouldn't happen, but rather the second + * mount should be rejected if the parameters are not compatible. + */ + fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + ret = parse_monolithic_mount_data(fc, data); + if (ret < 0) + goto out; + + ret = reconfigure_super(fc); +out: + put_fs_context(fc); + return ret; +} + static int compare_single(struct super_block *s, void *p) { return 1; @@ -1229,41 +1441,59 @@ struct dentry *mount_single(struct file_system_type *fs_type, return ERR_CAST(s); if (!s->s_root) { error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - s->s_flags |= SB_ACTIVE; + if (!error) + s->s_flags |= SB_ACTIVE; } else { - do_remount_sb(s, flags, data, 0); + error = reconfigure_single(s, flags, data); + } + if (unlikely(error)) { + deactivate_locked_super(s); + return ERR_PTR(error); } return dget(s->s_root); } EXPORT_SYMBOL(mount_single); -struct dentry * -mount_fs(struct file_system_type *type, int flags, const char *name, void *data) +/** + * vfs_get_tree - Get the mountable root + * @fc: The superblock configuration context. + * + * The filesystem is invoked to get or create a superblock which can then later + * be used for mounting. The filesystem places a pointer to the root to be + * used for mounting in @fc->root. + */ +int vfs_get_tree(struct fs_context *fc) { - struct dentry *root; struct super_block *sb; - int error = -ENOMEM; - void *sec_opts = NULL; + int error; - if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { - error = security_sb_eat_lsm_opts(data, &sec_opts); - if (error) - return ERR_PTR(error); - } + if (fc->root) + return -EBUSY; - root = type->mount(type, flags, name, data); - if (IS_ERR(root)) { - error = PTR_ERR(root); - goto out_free_secdata; + /* Get the mountable root in fc->root, with a ref on the root and a ref + * on the superblock. + */ + error = fc->ops->get_tree(fc); + if (error < 0) + return error; + + if (!fc->root) { + pr_err("Filesystem %s get_tree() didn't set fc->root\n", + fc->fs_type->name); + /* We don't know what the locking state of the superblock is - + * if there is a superblock.
+ */ + BUG(); } - sb = root->d_sb; - BUG_ON(!sb); + + sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); + if (fc->subtype && !sb->s_subtype) { + sb->s_subtype = fc->subtype; + fc->subtype = NULL; + } + /* * Write barrier is for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is the @@ -1273,14 +1503,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) smp_wmb(); sb->s_flags |= SB_BORN; - error = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL); - if (error) - goto out_sb; - - if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT))) { - error = security_sb_kern_mount(sb); - if (error) - goto out_sb; + error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); + if (unlikely(error)) { + fc_drop_locked(fc); + return error; } /* @@ -1290,18 +1516,11 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " - "negative value (%lld)\n", type->name, sb->s_maxbytes); + "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes); - up_write(&sb->s_umount); - security_free_mnt_opts(&sec_opts); - return root; -out_sb: - dput(root); - deactivate_locked_super(sb); -out_free_secdata: - security_free_mnt_opts(&sec_opts); - return ERR_PTR(error); + return 0; } +EXPORT_SYMBOL(vfs_get_tree); /* * Setup private BDI for given superblock. It gets automatically cleaned up diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 92682fcc41f6..1b56686ab178 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -13,34 +13,71 @@ #include <linux/magic.h> #include <linux/mount.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/fs_context.h> +#include <net/net_namespace.h> #include "sysfs.h" static struct kernfs_root *sysfs_root; struct kernfs_node *sysfs_root_kn; -static struct dentry *sysfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int sysfs_get_tree(struct fs_context *fc) { - struct dentry *root; - void *ns; - bool new_sb = false; + struct kernfs_fs_context *kfc = fc->fs_private; + int ret; - if (!(flags & SB_KERNMOUNT)) { + ret = kernfs_get_tree(fc); + if (ret) + return ret; + + if (kfc->new_sb_created) + fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; + return 0; +} + +static void sysfs_fs_context_free(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + if (kfc->ns_tag) + kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag); + kernfs_free_fs_context(fc); + kfree(kfc); +} + +static const struct fs_context_operations sysfs_fs_context_ops = { + .free = sysfs_fs_context_free, + .get_tree = sysfs_get_tree, +}; + +static int sysfs_init_fs_context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc; + struct net *netns; + + if (!(fc->sb_flags & SB_KERNMOUNT)) { if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) - return ERR_PTR(-EPERM); + return -EPERM; } - ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); - root = kernfs_mount_ns(fs_type, flags, sysfs_root, - SYSFS_MAGIC, &new_sb, ns); - if (!new_sb) - kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); - else if (!IS_ERR(root)) - root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; + kfc = kzalloc(sizeof(struct kernfs_fs_context), GFP_KERNEL); + if (!kfc) + return -ENOMEM; - return root; + kfc->ns_tag = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); + kfc->root = sysfs_root; + kfc->magic = SYSFS_MAGIC; + fc->fs_private = kfc; + fc->ops = &sysfs_fs_context_ops; + if (netns) { + if (fc->user_ns) 
+ put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(netns->user_ns); + } + fc->global = true; + return 0; } static void sysfs_kill_sb(struct super_block *sb) @@ -52,10 +89,10 @@ static void sysfs_kill_sb(struct super_block *sb) } static struct file_system_type sysfs_fs_type = { - .name = "sysfs", - .mount = sysfs_mount, - .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "sysfs", + .init_fs_context = sysfs_init_fs_context, + .kill_sb = sysfs_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index bc1e082d921d..9da2f135121b 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -8,6 +8,7 @@ config UBIFS_FS select CRYPTO_LZO if UBIFS_FS_LZO select CRYPTO_DEFLATE if UBIFS_FS_ZLIB select CRYPTO_HASH_INFO + select UBIFS_FS_XATTR if FS_ENCRYPTION depends on MTD_UBI help UBIFS is a file system for flash devices which works on top of UBI. @@ -60,17 +61,6 @@ config UBIFS_FS_XATTR If unsure, say Y. -config UBIFS_FS_ENCRYPTION - bool "UBIFS Encryption" - depends on UBIFS_FS_XATTR && BLOCK - select FS_ENCRYPTION - default n - help - Enable encryption of UBIFS files and directories. This - feature is similar to ecryptfs, but it is more memory - efficient since it avoids caching the encrypted and - decrypted pages in the page cache. - config UBIFS_FS_SECURITY bool "UBIFS Security Labels" depends on UBIFS_FS_XATTR diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile index 5f838319c8d5..5c4b845754a7 100644 --- a/fs/ubifs/Makefile +++ b/fs/ubifs/Makefile @@ -6,6 +6,6 @@ ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o debug.o ubifs-y += misc.o -ubifs-$(CONFIG_UBIFS_FS_ENCRYPTION) += crypto.o +ubifs-$(CONFIG_FS_ENCRYPTION) += crypto.o ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o ubifs-$(CONFIG_UBIFS_FS_AUTHENTICATION) += auth.o diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index 5bf5fd08879e..b758004085c4 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -33,7 +33,6 @@ int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node, int err; shash->tfm = c->hash_tfm; - shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_digest(shash, node, le32_to_cpu(ch->len), hash); if (err < 0) @@ -56,7 +55,6 @@ static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash, int err; shash->tfm = c->hmac_tfm; - shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_digest(shash, hash, c->hash_len, hmac); if (err < 0) @@ -88,7 +86,6 @@ int ubifs_prepare_auth_node(struct ubifs_info *c, void *node, return -ENOMEM; hash_desc->tfm = c->hash_tfm; - hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; ubifs_shash_copy_state(c, inhash, hash_desc); err = crypto_shash_final(hash_desc, hash); @@ -123,7 +120,6 @@ static struct shash_desc *ubifs_get_desc(const struct ubifs_info *c, return ERR_PTR(-ENOMEM); desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_init(desc); if (err) { @@ -364,7 +360,6 @@ static int ubifs_node_calc_hmac(const struct ubifs_info *c, const void *node, ubifs_assert(c, ofs_hmac + hmac_len < len); shash->tfm = c->hmac_tfm; - shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_init(shash); if (err) @@ -483,7 +478,6 @@ int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac) return 0; shash->tfm = c->hmac_tfm; - shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_init(shash); if (err) diff --git a/fs/ubifs/ioctl.c 
b/fs/ubifs/ioctl.c index 0164bcc827f8..82e4e6a30b04 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -28,6 +28,11 @@ #include <linux/mount.h> #include "ubifs.h" +/* Need to be kept consistent with checked flags in ioctl2ubifs() */ +#define UBIFS_SUPPORTED_IOCTL_FLAGS \ + (FS_COMPR_FL | FS_SYNC_FL | FS_APPEND_FL | \ + FS_IMMUTABLE_FL | FS_DIRSYNC_FL) + /** * ubifs_set_inode_flags - set VFS inode flags. * @inode: VFS inode to set flags for @@ -169,6 +174,9 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; + if (flags & ~UBIFS_SUPPORTED_IOCTL_FLAGS) + return -EOPNOTSUPP; + if (!S_ISDIR(inode->i_mode)) flags &= ~FS_DIRSYNC_FL; @@ -185,7 +193,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } case FS_IOC_SET_ENCRYPTION_POLICY: { -#ifdef CONFIG_UBIFS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION struct ubifs_info *c = inode->i_sb->s_fs_info; err = ubifs_enable_encryption(c); @@ -198,7 +206,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) #endif } case FS_IOC_GET_ENCRYPTION_POLICY: { -#ifdef CONFIG_UBIFS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION return fscrypt_ioctl_get_policy(file, (void __user *)arg); #else return -EOPNOTSUPP; diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 0a0e65c07c6d..5c8a81a019a4 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -576,7 +576,6 @@ static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_h SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm); hash_desc->tfm = c->hash_tfm; - hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; ubifs_shash_copy_state(c, log_hash, hash_desc); return crypto_shash_final(hash_desc, hash); @@ -587,7 +586,6 @@ static int authenticate_sleb_hmac(struct ubifs_info *c, u8 *hash, u8 *hmac) SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm); hmac_desc->tfm = c->hmac_tfm; - hmac_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; return crypto_shash_digest(hmac_desc, hash, c->hash_len, hmac); } diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 3da90c951c23..67fac1e8adfb 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -748,7 +748,7 @@ int ubifs_read_superblock(struct ubifs_info *c) goto out; } -#ifndef CONFIG_UBIFS_FS_ENCRYPTION +#ifndef CONFIG_FS_ENCRYPTION if (c->encrypted) { ubifs_err(c, "file system contains encrypted files but UBIFS" " was built without crypto support."); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 1fac1133dadd..12628184772c 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -276,14 +276,12 @@ static void ubifs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); struct ubifs_inode *ui = ubifs_inode(inode); + kfree(ui->data); kmem_cache_free(ubifs_inode_slab, ui); } static void ubifs_destroy_inode(struct inode *inode) { - struct ubifs_inode *ui = ubifs_inode(inode); - - kfree(ui->data); call_rcu(&inode->i_rcu, ubifs_i_callback); } @@ -2146,7 +2144,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_UBIFS_FS_XATTR sb->s_xattr = ubifs_xattr_handlers; #endif -#ifdef CONFIG_UBIFS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &ubifs_crypt_operations; #endif diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 38401adaa00d..1ae12900e01d 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -43,7 +43,6 @@ #include <crypto/hash.h> #include <crypto/algapi.h> -#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_UBIFS_FS_ENCRYPTION) #include <linux/fscrypt.h> 
#include "ubifs-media.h" @@ -142,7 +141,7 @@ */ #define WORST_COMPR_FACTOR 2 -#ifdef CONFIG_UBIFS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION #define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE #else #define UBIFS_CIPHER_BLOCK_SIZE 0 @@ -2072,7 +2071,7 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len, #include "misc.h" #include "key.h" -#ifndef CONFIG_UBIFS_FS_ENCRYPTION +#ifndef CONFIG_FS_ENCRYPTION static inline int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, diff --git a/fs/udf/inode.c b/fs/udf/inode.c index ae796e10f68b..e7276932e433 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1242,8 +1242,10 @@ set_size: truncate_setsize(inode, newsize); down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); - udf_truncate_extents(inode); + err = udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); + if (err) + return err; } update_time: inode->i_mtime = inode->i_ctime = current_time(inode); diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index b647f0bd150c..63a47f1e1d52 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -199,7 +199,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode, * for making file shorter. For making file longer, udf_extend_file() has to * be used. */ -void udf_truncate_extents(struct inode *inode) +int udf_truncate_extents(struct inode *inode) { struct extent_position epos; struct kernel_lb_addr eloc, neloc = {}; @@ -224,7 +224,7 @@ void udf_truncate_extents(struct inode *inode) if (etype == -1) { /* We should extend the file? */ WARN_ON(byte_offset); - return; + return 0; } epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); @@ -260,6 +260,9 @@ void udf_truncate_extents(struct inode *inode) epos.block = eloc; epos.bh = udf_tread(sb, udf_get_lb_pblock(sb, &eloc, 0)); + /* Error reading indirect block? 
*/ + if (!epos.bh) + return -EIO; if (elen) indirect_ext_len = (elen + sb->s_blocksize - 1) >> @@ -283,4 +286,5 @@ void udf_truncate_extents(struct inode *inode) iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); + return 0; } diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index ee246769dee4..d89ef71887fc 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -235,7 +235,7 @@ extern struct inode *udf_new_inode(struct inode *, umode_t); /* truncate.c */ extern void udf_truncate_tail_extent(struct inode *); extern void udf_discard_prealloc(struct inode *); -extern void udf_truncate_extents(struct inode *); +extern int udf_truncate_extents(struct inode *); /* balloc.c */ extern void udf_free_blocks(struct super_block *, struct inode *, diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 1fd3011ea623..7fd4802222b8 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -229,7 +229,7 @@ ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode) case UFS_UID_44BSD: return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_gid); case UFS_UID_EFT: - if (inode->ui_u1.oldids.ui_suid == 0xFFFF) + if (inode->ui_u1.oldids.ui_sgid == 0xFFFF) return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid); /* Fall through */ default: diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 89800fc7dc9d..f5de1e726356 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -629,6 +629,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ down_write(&mm->mmap_sem); + /* no task can run (and in turn coredump) yet */ + VM_WARN_ON(!mmget_still_valid(mm)); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -883,6 +885,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_sem for writing. */ down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto skip_mm; prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -905,6 +909,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } +skip_mm: up_write(&mm->mmap_sem); mmput(mm); wakeup: @@ -1333,6 +1338,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1520,6 +1527,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 48502cb9990f..4637ae1ae91c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1191,7 +1191,10 @@ xfs_iread_extents( * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. 
*/ level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); + if (unlikely(level == 0)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); @@ -4249,9 +4252,13 @@ xfs_bmapi_write( struct xfs_bmbt_irec *mval, /* output: map values */ int *nmap) /* i/o: mval size/count */ { + struct xfs_bmalloca bma = { + .tp = tp, + .ip = ip, + .total = total, + }; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; - struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */ xfs_fileoff_t end; /* end of mapped file region */ bool eof = false; /* after the end of extents */ int error; /* error return */ @@ -4319,10 +4326,6 @@ xfs_bmapi_write( eof = true; if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) bma.prev.br_startoff = NULLFILEOFF; - bma.tp = tp; - bma.ip = ip; - bma.total = total; - bma.datatype = 0; bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); n = 0; diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 9a3767818c50..9c2a0a13ed61 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -563,43 +563,40 @@ xfs_dir3_leaf_find_entry( */ int /* error */ xfs_dir2_leaf_addname( - xfs_da_args_t *args) /* operation arguments */ + struct xfs_da_args *args) /* operation arguments */ { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_trans *tp = args->trans; __be16 *bestsp; /* freespace table in leaf */ - int compact; /* need to compact leaves */ - xfs_dir2_data_hdr_t *hdr; /* data block header */ + __be16 *tagp; /* end of data entry */ struct xfs_buf *dbp; /* data block buffer */ - xfs_dir2_data_entry_t *dep; /* data block entry */ - xfs_inode_t *dp; /* incore directory inode */ - xfs_dir2_data_unused_t *dup; /* data unused entry */ + struct xfs_buf *lbp; /* leaf's buffer */ + struct xfs_dir2_leaf *leaf; /* leaf structure */ + struct xfs_inode *dp = args->dp; /* incore directory inode */ + struct xfs_dir2_data_hdr *hdr; /* data block header */ + struct xfs_dir2_data_entry *dep; /* data block entry */ + struct xfs_dir2_leaf_entry *lep; /* leaf entry table pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir2_data_unused *dup; /* data unused entry */ + struct xfs_dir2_leaf_tail *ltp; /* leaf tail pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + int compact; /* need to compact leaves */ int error; /* error return value */ int grown; /* allocated new data block */ - int highstale; /* index of next stale leaf */ + int highstale = 0; /* index of next stale leaf */ int i; /* temporary, index */ int index; /* leaf table position */ - struct xfs_buf *lbp; /* leaf's buffer */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ int length; /* length of new entry */ - xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ int lfloglow; /* low leaf logging index */ int lfloghigh; /* high leaf logging index */ - int lowstale; /* index of prev stale leaf */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ + int lowstale = 0; /* index of prev stale leaf */ int needbytes; /* leaf block bytes needed */ int needlog; /* need to log data header */ int needscan; /* need to rescan data free */ - __be16 *tagp; /* end of data entry */ - xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t use_block; /* data block number */ - struct xfs_dir2_data_free *bf; /* bestfree table */ - struct xfs_dir2_leaf_entry *ents; - struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_addname(args); - dp = 
args->dp; - tp = args->trans; - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 3b03703c5c3d..16731d2d684b 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -426,24 +426,22 @@ xfs_dir2_leaf_to_node( static int /* error */ xfs_dir2_leafn_add( struct xfs_buf *bp, /* leaf buffer */ - xfs_da_args_t *args, /* operation arguments */ + struct xfs_da_args *args, /* operation arguments */ int index) /* insertion pt for new entry */ { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_inode *dp = args->dp; + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *lep; + struct xfs_dir2_leaf_entry *ents; int compact; /* compacting stale leaves */ - xfs_inode_t *dp; /* incore directory inode */ - int highstale; /* next stale entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int highstale = 0; /* next stale entry */ int lfloghigh; /* high leaf entry logging */ int lfloglow; /* low leaf entry logging */ - int lowstale; /* previous stale entry */ - struct xfs_dir3_icleaf_hdr leafhdr; - struct xfs_dir2_leaf_entry *ents; + int lowstale = 0; /* previous stale entry */ trace_xfs_dir2_leafn_add(args, index); - dp = args->dp; - leaf = bp->b_addr; dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ents = dp->d_ops->leaf_ents_p(leaf); diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 6f94d1f7322d..117910db51b8 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -415,8 +415,17 @@ xchk_btree_check_owner( struct xfs_btree_cur *cur = bs->cur; struct check_owner *co; - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && bp == NULL) + /* + * In theory, xfs_btree_get_block should only give us a null buffer + * pointer for the root of a root-in-inode btree type, but we need + * to check defensively here in case the cursor state is also screwed + * up. + */ + if (bp == NULL) { + if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)) + xchk_btree_set_corrupt(bs->sc, bs->cur, level); return 0; + } /* * We want to cross-reference each btree block with the bnobt diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index f1260b4bfdee..90527b094878 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -574,6 +574,11 @@ xchk_da_btree( /* Drill another level deeper. */ blkno = be32_to_cpu(key->before); level++; + if (level >= XFS_DA_NODE_MAXDEPTH) { + /* Too deep! */ + xchk_da_set_corrupt(&ds, level - 1); + break; + } ds.tree_level--; error = xchk_da_btree_block(&ds, level, blkno); if (error) diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 93f07edafd81..9ee2a7d02e70 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -161,6 +161,14 @@ xfs_ioc_trim( return -EPERM; if (!blk_queue_discard(q)) return -EOPNOTSUPP; + + /* + * We haven't recovered the log, so we cannot use our bnobt-guided + * storage zapping commands. + */ + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + return -EROFS; + if (copy_from_user(&range, urange, sizeof(range))) return -EFAULT; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1f2e2845eb76..a7ceae90110e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -529,18 +529,17 @@ xfs_file_dio_aio_write( count = iov_iter_count(from); /* - * If we are doing unaligned IO, wait for all other IO to drain, - * otherwise demote the lock if we had to take the exclusive lock - * for other reasons in xfs_file_aio_write_checks. 
+ * If we are doing unaligned IO, we can't allow any other overlapping IO + * in-flight at the same time or we risk data corruption. Wait for all + * other IO to drain before we submit. If the IO is aligned, demote the + * iolock if we had to take the exclusive lock in + * xfs_file_aio_write_checks() for other reasons. */ if (unaligned_io) { - /* If we are going to wait for other DIO to finish, bail */ - if (iocb->ki_flags & IOCB_NOWAIT) { - if (atomic_read(&inode->i_dio_count)) - return -EAGAIN; - } else { - inode_dio_wait(inode); - } + /* unaligned dio always waits, bail */ + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_dio_wait(inode); } else if (iolock == XFS_IOLOCK_EXCL) { xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; @@ -548,6 +547,14 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos); ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); + + /* + * If unaligned, this is the only IO in-flight. If it has not yet + * completed, wait on it before we release the iolock to prevent + * subsequent overlapping IO. + */ + if (ret == -EIOCBQUEUED && unaligned_io) + inode_dio_wait(inode); out: xfs_iunlock(ip, iolock);
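One smaller change above deserves a usage note: with FMODE_STREAM set, file_pos_read() now yields 0 and file_pos_write() stores nothing, so read()/write() on position-less files no longer touch (or race on) f_pos. A driver would opt in from its open method; a hedged sketch, with exdev_open() being a hypothetical name:

#include <linux/fs.h>

static int exdev_open(struct inode *inode, struct file *filp)
{
	/*
	 * Declare the file position meaningless: the f_pos helpers in
	 * read_write.c above will neither read nor update it.
	 */
	filp->f_mode |= FMODE_STREAM;

	/* Also refuse lseek()/pread()/pwrite() in the usual way */
	return nonseekable_open(inode, filp);
}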
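Stepping back from the individual hunks: the proc and sysfs conversions, together with sget_fc(), vfs_get_super() and fc_mount(), define the whole new mount sequence. A condensed sketch of a full conversion for the same hypothetical "examplefs", using only helpers visible in this diff plus the long-standing libfs ones (simple_fill_super(), kill_litter_super()); the magic number is made up:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/mount.h>

static int examplefs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	static const struct tree_descr empty[] = { { "" } };

	/* libfs helper: installs s_op, s_magic and a root directory */
	return simple_fill_super(sb, 0x73616d70 /* hypothetical magic */, empty);
}

static int examplefs_get_tree(struct fs_context *fc)
{
	/* one superblock system-wide, the "single" keying described above */
	return vfs_get_super(fc, vfs_get_single_super, examplefs_fill_super);
}

static const struct fs_context_operations examplefs_context_ops = {
	.get_tree	= examplefs_get_tree,
};

static int examplefs_init_fs_context(struct fs_context *fc)
{
	fc->ops = &examplefs_context_ops;
	return 0;
}

static struct file_system_type examplefs_fs_type = {
	.name			= "examplefs",
	.init_fs_context	= examplefs_init_fs_context,
	.kill_sb		= kill_litter_super,
};

/* Kernel-internal mount, the shape that replaces kern_mount_data() above */
static struct vfsmount *examplefs_kern_mount(void)
{
	struct fs_context *fc;
	struct vfsmount *mnt;

	fc = fs_context_for_mount(&examplefs_fs_type, SB_KERNMOUNT);
	if (IS_ERR(fc))
		return ERR_CAST(fc);

	mnt = fc_mount(fc);	/* vfs_get_tree() plus mount allocation */
	put_fs_context(fc);
	return mnt;
}

The old ->mount() and kern_mount_data() entry points collapse into ->init_fs_context() plus fc_mount(), which is exactly the substitution made in pid_ns_prepare_proc() earlier in this diff.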