summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs_vfs.h23
-rw-r--r--fs/9p/vfs_file.c6
-rw-r--r--fs/9p/vfs_inode.c23
-rw-r--r--fs/9p/vfs_inode_dotl.c27
-rw-r--r--fs/9p/vfs_super.c6
-rw-r--r--fs/Kconfig10
-rw-r--r--fs/Makefile3
-rw-r--r--fs/afs/callback.c3
-rw-r--r--fs/afs/cmservice.c2
-rw-r--r--fs/afs/fsclient.c6
-rw-r--r--fs/afs/inode.c4
-rw-r--r--fs/afs/internal.h13
-rw-r--r--fs/afs/mntpt.c149
-rw-r--r--fs/afs/rxrpc.c30
-rw-r--r--fs/afs/server.c1
-rw-r--r--fs/afs/super.c432
-rw-r--r--fs/afs/volume.c4
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/afs/yfsclient.c2
-rw-r--r--fs/aio.c338
-rw-r--r--fs/block_dev.c23
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent-tree.c76
-rw-r--r--fs/btrfs/extent_io.c4
-rw-r--r--fs/btrfs/file-item.c15
-rw-r--r--fs/btrfs/inode.c2
-rw-r--r--fs/btrfs/ioctl.c31
-rw-r--r--fs/btrfs/ordered-data.c3
-rw-r--r--fs/btrfs/props.c8
-rw-r--r--fs/btrfs/qgroup.c17
-rw-r--r--fs/btrfs/raid56.c3
-rw-r--r--fs/btrfs/ref-verify.c15
-rw-r--r--fs/btrfs/root-tree.c8
-rw-r--r--fs/btrfs/transaction.c49
-rw-r--r--fs/btrfs/tree-log.c33
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/btrfs/zstd.c6
-rw-r--r--fs/ceph/caps.c72
-rw-r--r--fs/ceph/debugfs.c27
-rw-r--r--fs/ceph/dir.c461
-rw-r--r--fs/ceph/file.c13
-rw-r--r--fs/ceph/inode.c70
-rw-r--r--fs/ceph/mds_client.c768
-rw-r--r--fs/ceph/mds_client.h43
-rw-r--r--fs/ceph/snap.c166
-rw-r--r--fs/ceph/super.c21
-rw-r--r--fs/ceph/super.h43
-rw-r--r--fs/ceph/xattr.c20
-rw-r--r--fs/cifs/Kconfig120
-rw-r--r--fs/cifs/cifs_debug.c11
-rw-r--r--fs/cifs/cifs_dfs_ref.c4
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifs_ioctl.h3
-rw-r--r--fs/cifs/cifsfs.c7
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h91
-rw-r--r--fs/cifs/cifsproto.h8
-rw-r--r--fs/cifs/cifssmb.c54
-rw-r--r--fs/cifs/connect.c100
-rw-r--r--fs/cifs/dir.c107
-rw-r--r--fs/cifs/file.c460
-rw-r--r--fs/cifs/inode.c6
-rw-r--r--fs/cifs/link.c14
-rw-r--r--fs/cifs/misc.c49
-rw-r--r--fs/cifs/smb1ops.c134
-rw-r--r--fs/cifs/smb2file.c6
-rw-r--r--fs/cifs/smb2inode.c87
-rw-r--r--fs/cifs/smb2maperror.c3
-rw-r--r--fs/cifs/smb2misc.c30
-rw-r--r--fs/cifs/smb2ops.c572
-rw-r--r--fs/cifs/smb2pdu.c388
-rw-r--r--fs/cifs/smb2pdu.h11
-rw-r--r--fs/cifs/smb2proto.h10
-rw-r--r--fs/cifs/smb2status.h6
-rw-r--r--fs/cifs/smb2transport.c25
-rw-r--r--fs/cifs/smbdirect.c6
-rw-r--r--fs/cifs/trace.h219
-rw-r--r--fs/cifs/transport.c302
-rw-r--r--fs/crypto/Kconfig6
-rw-r--r--fs/crypto/fscrypt_private.h1
-rw-r--r--fs/crypto/hooks.c6
-rw-r--r--fs/crypto/keyinfo.c1
-rw-r--r--fs/crypto/policy.c3
-rw-r--r--fs/dax.c40
-rw-r--r--fs/debugfs/inode.c13
-rw-r--r--fs/devpts/inode.c1
-rw-r--r--fs/ecryptfs/crypto.c1
-rw-r--r--fs/ecryptfs/keystore.c1
-rw-r--r--fs/exofs/BUGS3
-rw-r--r--fs/exofs/Kbuild20
-rw-r--r--fs/exofs/Kconfig13
-rw-r--r--fs/exofs/Kconfig.ore14
-rw-r--r--fs/exofs/common.h262
-rw-r--r--fs/exofs/dir.c661
-rw-r--r--fs/exofs/exofs.h240
-rw-r--r--fs/exofs/file.c83
-rw-r--r--fs/exofs/inode.c1514
-rw-r--r--fs/exofs/namei.c323
-rw-r--r--fs/exofs/ore.c1179
-rw-r--r--fs/exofs/ore_raid.c757
-rw-r--r--fs/exofs/ore_raid.h62
-rw-r--r--fs/exofs/super.c1071
-rw-r--r--fs/exofs/sys.c205
-rw-r--r--fs/ext4/Kconfig17
-rw-r--r--fs/ext4/dir.c10
-rw-r--r--fs/ext4/ext4.h22
-rw-r--r--fs/ext4/ext4_jbd2.h4
-rw-r--r--fs/ext4/extents.c33
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/indirect.c49
-rw-r--r--fs/ext4/inode.c75
-rw-r--r--fs/ext4/ioctl.c112
-rw-r--r--fs/ext4/mballoc.c7
-rw-r--r--fs/ext4/move_extent.c3
-rw-r--r--fs/ext4/namei.c18
-rw-r--r--fs/ext4/page-io.c13
-rw-r--r--fs/ext4/readpage.c5
-rw-r--r--fs/ext4/resize.c20
-rw-r--r--fs/ext4/super.c23
-rw-r--r--fs/ext4/sysfs.c17
-rw-r--r--fs/ext4/xattr.c3
-rw-r--r--fs/f2fs/Kconfig12
-rw-r--r--fs/f2fs/checkpoint.c20
-rw-r--r--fs/f2fs/data.c63
-rw-r--r--fs/f2fs/debug.c19
-rw-r--r--fs/f2fs/dir.c25
-rw-r--r--fs/f2fs/extent_cache.c2
-rw-r--r--fs/f2fs/f2fs.h92
-rw-r--r--fs/f2fs/file.c56
-rw-r--r--fs/f2fs/inline.c12
-rw-r--r--fs/f2fs/inode.c19
-rw-r--r--fs/f2fs/namei.c9
-rw-r--r--fs/f2fs/node.c6
-rw-r--r--fs/f2fs/segment.c80
-rw-r--r--fs/f2fs/segment.h2
-rw-r--r--fs/f2fs/super.c117
-rw-r--r--fs/f2fs/sysfs.c21
-rw-r--r--fs/f2fs/trace.c20
-rw-r--r--fs/f2fs/xattr.c25
-rw-r--r--fs/f2fs/xattr.h6
-rw-r--r--fs/filesystems.c4
-rw-r--r--fs/fs_context.c642
-rw-r--r--fs/fs_parser.c447
-rw-r--r--fs/fuse/control.c4
-rw-r--r--fs/fuse/cuse.c7
-rw-r--r--fs/fuse/dev.c127
-rw-r--r--fs/fuse/dir.c54
-rw-r--r--fs/fuse/file.c342
-rw-r--r--fs/fuse/fuse_i.h28
-rw-r--r--fs/fuse/inode.c28
-rw-r--r--fs/fuse/readdir.c4
-rw-r--r--fs/gfs2/glock.c72
-rw-r--r--fs/gfs2/glock.h4
-rw-r--r--fs/gfs2/incore.h3
-rw-r--r--fs/gfs2/inode.h4
-rw-r--r--fs/gfs2/main.c6
-rw-r--r--fs/hpfs/hpfs.h8
-rw-r--r--fs/hugetlbfs/inode.c378
-rw-r--r--fs/inode.c9
-rw-r--r--fs/internal.h13
-rw-r--r--fs/io_uring.c790
-rw-r--r--fs/iomap.c12
-rw-r--r--fs/jbd2/checkpoint.c17
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/jbd2/journal.c90
-rw-r--r--fs/jbd2/transaction.c83
-rw-r--r--fs/jffs2/readinode.c5
-rw-r--r--fs/jffs2/super.c5
-rw-r--r--fs/kernfs/kernfs-internal.h1
-rw-r--r--fs/kernfs/mount.c119
-rw-r--r--fs/lockd/clnt4xdr.c14
-rw-r--r--fs/lockd/clntxdr.c14
-rw-r--r--fs/lockd/host.c3
-rw-r--r--fs/locks.c5
-rw-r--r--fs/mount.h5
-rw-r--r--fs/namei.c5
-rw-r--r--fs/namespace.c395
-rw-r--r--fs/nfs/callback_xdr.c64
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/delegation.c22
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c98
-rw-r--r--fs/nfs/direct.c7
-rw-r--r--fs/nfs/file.c44
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c230
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h75
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c161
-rw-r--r--fs/nfs/inode.c33
-rw-r--r--fs/nfs/internal.h5
-rw-r--r--fs/nfs/io.c12
-rw-r--r--fs/nfs/namespace.c8
-rw-r--r--fs/nfs/nfs2xdr.c124
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3xdr.c209
-rw-r--r--fs/nfs/nfs42.h3
-rw-r--r--fs/nfs/nfs42proc.c167
-rw-r--r--fs/nfs/nfs42xdr.c130
-rw-r--r--fs/nfs/nfs4client.c33
-rw-r--r--fs/nfs/nfs4file.c4
-rw-r--r--fs/nfs/nfs4namespace.c5
-rw-r--r--fs/nfs/nfs4proc.c143
-rw-r--r--fs/nfs/nfs4session.c7
-rw-r--r--fs/nfs/nfs4session.h7
-rw-r--r--fs/nfs/nfs4state.c1
-rw-r--r--fs/nfs/nfs4trace.h25
-rw-r--r--fs/nfs/nfs4xdr.c530
-rw-r--r--fs/nfs/nfstrace.c1
-rw-r--r--fs/nfs/nfstrace.h85
-rw-r--r--fs/nfs/pagelist.c47
-rw-r--r--fs/nfs/pnfs.c35
-rw-r--r--fs/nfs/pnfs.h2
-rw-r--r--fs/nfs/pnfs_dev.c13
-rw-r--r--fs/nfs/read.c2
-rw-r--r--fs/nfs/super.c5
-rw-r--r--fs/nfs/unlink.c8
-rw-r--r--fs/nfs/write.c19
-rw-r--r--fs/nfsd/nfs3proc.c35
-rw-r--r--fs/nfsd/nfs3xdr.c14
-rw-r--r--fs/nfsd/nfs4callback.c25
-rw-r--r--fs/nfsd/nfs4recover.c1
-rw-r--r--fs/nfsd/nfs4state.c20
-rw-r--r--fs/nfsd/nfsctl.c2
-rw-r--r--fs/nfsd/state.h1
-rw-r--r--fs/nilfs2/btnode.c2
-rw-r--r--fs/notify/fanotify/fanotify.c14
-rw-r--r--fs/notify/fanotify/fanotify_user.c12
-rw-r--r--fs/notify/inotify/inotify_user.c7
-rw-r--r--fs/notify/mark.c12
-rw-r--r--fs/ocfs2/refcounttree.c42
-rw-r--r--fs/open.c24
-rw-r--r--fs/orangefs/inode.c7
-rw-r--r--fs/overlayfs/copy_up.c59
-rw-r--r--fs/overlayfs/overlayfs.h2
-rw-r--r--fs/overlayfs/util.c55
-rw-r--r--fs/pipe.c36
-rw-r--r--fs/pnode.c5
-rw-r--r--fs/pnode.h3
-rw-r--r--fs/proc/base.c105
-rw-r--r--fs/proc/inode.c52
-rw-r--r--fs/proc/internal.h5
-rw-r--r--fs/proc/kcore.c29
-rw-r--r--fs/proc/proc_sysctl.c7
-rw-r--r--fs/proc/root.c236
-rw-r--r--fs/proc/task_mmu.c20
-rw-r--r--fs/read_write.c15
-rw-r--r--fs/splice.c24
-rw-r--r--fs/stat.c12
-rw-r--r--fs/super.c341
-rw-r--r--fs/sysfs/mount.c75
-rw-r--r--fs/ubifs/Kconfig12
-rw-r--r--fs/ubifs/Makefile2
-rw-r--r--fs/ubifs/auth.c6
-rw-r--r--fs/ubifs/ioctl.c12
-rw-r--r--fs/ubifs/replay.c2
-rw-r--r--fs/ubifs/sb.c2
-rw-r--r--fs/ubifs/super.c6
-rw-r--r--fs/ubifs/ubifs.h5
-rw-r--r--fs/udf/inode.c4
-rw-r--r--fs/udf/truncate.c8
-rw-r--r--fs/udf/udfdecl.h2
-rw-r--r--fs/ufs/util.h2
-rw-r--r--fs/userfaultfd.c9
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c15
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c37
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c18
-rw-r--r--fs/xfs/scrub/btree.c11
-rw-r--r--fs/xfs/scrub/dabtree.c5
-rw-r--r--fs/xfs/xfs_discard.c8
-rw-r--r--fs/xfs/xfs_file.c27
272 files changed, 9777 insertions, 11282 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 5a0db6dec8d1..aaee1e6584e6 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,6 +40,9 @@
*/
#define P9_LOCK_TIMEOUT (30*HZ)
+/* flags for v9fs_stat2inode() & v9fs_stat2inode_dotl() */
+#define V9FS_STAT2INODE_KEEP_ISIZE 1
+
extern struct file_system_type v9fs_fs_type;
extern const struct address_space_operations v9fs_addr_operations;
extern const struct file_operations v9fs_file_operations;
@@ -61,8 +64,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
struct inode *inode, umode_t mode, dev_t);
void v9fs_evict_inode(struct inode *inode);
ino_t v9fs_qid2ino(struct p9_qid *qid);
-void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
-void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
+void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
+ struct super_block *sb, unsigned int flags);
+void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
+ unsigned int flags);
int v9fs_dir_release(struct inode *inode, struct file *filp);
int v9fs_file_open(struct inode *inode, struct file *file);
void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
@@ -83,4 +88,18 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode)
}
int v9fs_open_to_dotl_flags(int flags);
+
+static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size)
+{
+ /*
+ * 32-bit need the lock, concurrent updates could break the
+ * sequences and make i_size_read() loop forever.
+ * 64-bit updates are atomic and can skip the locking.
+ */
+ if (sizeof(i_size) > sizeof(long))
+ spin_lock(&inode->i_lock);
+ i_size_write(inode, i_size);
+ if (sizeof(i_size) > sizeof(long))
+ spin_unlock(&inode->i_lock);
+}
#endif
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index a25efa782fcc..9a1125305d84 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -446,7 +446,11 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
i_size = i_size_read(inode);
if (iocb->ki_pos > i_size) {
inode_add_bytes(inode, iocb->ki_pos - i_size);
- i_size_write(inode, iocb->ki_pos);
+ /*
+ * Need to serialize against i_size_write() in
+ * v9fs_stat2inode()
+ */
+ v9fs_i_size_write(inode, iocb->ki_pos);
}
return retval;
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 85ff859d3af5..72b779bc0942 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -538,7 +538,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
if (retval)
goto error;
- v9fs_stat2inode(st, inode, sb);
+ v9fs_stat2inode(st, inode, sb, 0);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
return inode;
@@ -1092,7 +1092,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat,
if (IS_ERR(st))
return PTR_ERR(st);
- v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb);
+ v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
generic_fillattr(d_inode(dentry), stat);
p9stat_free(st);
@@ -1170,12 +1170,13 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
* @stat: Plan 9 metadata (mistat) structure
* @inode: inode to populate
* @sb: superblock of filesystem
+ * @flags: control flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE)
*
*/
void
v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
- struct super_block *sb)
+ struct super_block *sb, unsigned int flags)
{
umode_t mode;
char ext[32];
@@ -1216,10 +1217,11 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
mode = p9mode2perm(v9ses, stat);
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
- i_size_write(inode, stat->length);
+ if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+ v9fs_i_size_write(inode, stat->length);
/* not real number of blocks, but 512 byte ones ... */
- inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
+ inode->i_blocks = (stat->length + 512 - 1) >> 9;
v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
}
@@ -1416,9 +1418,9 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
{
int umode;
dev_t rdev;
- loff_t i_size;
struct p9_wstat *st;
struct v9fs_session_info *v9ses;
+ unsigned int flags;
v9ses = v9fs_inode2v9ses(inode);
st = p9_client_stat(fid);
@@ -1431,16 +1433,13 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
goto out;
- spin_lock(&inode->i_lock);
/*
* We don't want to refresh inode->i_size,
* because we may have cached data
*/
- i_size = inode->i_size;
- v9fs_stat2inode(st, inode, inode->i_sb);
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- inode->i_size = i_size;
- spin_unlock(&inode->i_lock);
+ flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+ V9FS_STAT2INODE_KEEP_ISIZE : 0;
+ v9fs_stat2inode(st, inode, inode->i_sb, flags);
out:
p9stat_free(st);
kfree(st);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 4823e1c46999..a950a927a626 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -143,7 +143,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
if (retval)
goto error;
- v9fs_stat2inode_dotl(st, inode);
+ v9fs_stat2inode_dotl(st, inode, 0);
v9fs_cache_inode_get_cookie(inode);
retval = v9fs_get_acl(inode, fid);
if (retval)
@@ -496,7 +496,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat,
if (IS_ERR(st))
return PTR_ERR(st);
- v9fs_stat2inode_dotl(st, d_inode(dentry));
+ v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
generic_fillattr(d_inode(dentry), stat);
/* Change block size to what the server returned */
stat->blksize = st->st_blksize;
@@ -607,11 +607,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
* v9fs_stat2inode_dotl - populate an inode structure with stat info
* @stat: stat structure
* @inode: inode to populate
+ * @flags: ctrl flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE)
*
*/
void
-v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
+ unsigned int flags)
{
umode_t mode;
struct v9fs_inode *v9inode = V9FS_I(inode);
@@ -631,7 +633,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
- i_size_write(inode, stat->st_size);
+ if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+ v9fs_i_size_write(inode, stat->st_size);
inode->i_blocks = stat->st_blocks;
} else {
if (stat->st_result_mask & P9_STATS_ATIME) {
@@ -661,8 +664,9 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
}
if (stat->st_result_mask & P9_STATS_RDEV)
inode->i_rdev = new_decode_dev(stat->st_rdev);
- if (stat->st_result_mask & P9_STATS_SIZE)
- i_size_write(inode, stat->st_size);
+ if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
+ stat->st_result_mask & P9_STATS_SIZE)
+ v9fs_i_size_write(inode, stat->st_size);
if (stat->st_result_mask & P9_STATS_BLOCKS)
inode->i_blocks = stat->st_blocks;
}
@@ -928,9 +932,9 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry,
int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
{
- loff_t i_size;
struct p9_stat_dotl *st;
struct v9fs_session_info *v9ses;
+ unsigned int flags;
v9ses = v9fs_inode2v9ses(inode);
st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
@@ -942,16 +946,13 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
goto out;
- spin_lock(&inode->i_lock);
/*
* We don't want to refresh inode->i_size,
* because we may have cached data
*/
- i_size = inode->i_size;
- v9fs_stat2inode_dotl(st, inode);
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- inode->i_size = i_size;
- spin_unlock(&inode->i_lock);
+ flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+ V9FS_STAT2INODE_KEEP_ISIZE : 0;
+ v9fs_stat2inode_dotl(st, inode, flags);
out:
kfree(st);
return 0;
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 48ce50484e80..d13d35cf69c0 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -92,7 +92,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
return ret;
if (v9ses->cache)
- sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
if (!v9ses->cache)
@@ -172,7 +172,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
goto release_sb;
}
d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode_dotl(st, d_inode(root));
+ v9fs_stat2inode_dotl(st, d_inode(root), 0);
kfree(st);
} else {
struct p9_wstat *st = NULL;
@@ -183,7 +183,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
}
d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode(st, d_inode(root), sb);
+ v9fs_stat2inode(st, d_inode(root), sb, 0);
p9stat_free(st);
kfree(st);
diff --git a/fs/Kconfig b/fs/Kconfig
index ac474a61be37..3e6d3101f3ff 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -8,6 +8,13 @@ menu "File systems"
config DCACHE_WORD_ACCESS
bool
+config VALIDATE_FS_PARSER
+ bool "Validate filesystem parameter description"
+ default y
+ help
+ Enable this to perform validation of the parameter description for a
+ filesystem when it is registered.
+
if BLOCK
config FS_IOMAP
@@ -254,12 +261,9 @@ source "fs/romfs/Kconfig"
source "fs/pstore/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
-source "fs/exofs/Kconfig"
endif # MISC_FILESYSTEMS
-source "fs/exofs/Kconfig.ore"
-
menuconfig NETWORK_FILESYSTEMS
bool "Network File Systems"
default y
diff --git a/fs/Makefile b/fs/Makefile
index ffeaa6632ab4..427fec226fae 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,7 +13,7 @@ obj-y := open.o read_write.o file_table.o super.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
- fs_types.o
+ fs_types.o fs_context.o fs_parser.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
@@ -126,7 +126,6 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_F2FS_FS) += f2fs/
-obj-y += exofs/ # Multiple modules
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 1c7955f5cdaf..128f2dbe256a 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -203,8 +203,7 @@ void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi)
*/
void afs_init_callback_state(struct afs_server *server)
{
- if (!test_and_clear_bit(AFS_SERVER_FL_NEW, &server->flags))
- server->cb_s_break++;
+ server->cb_s_break++;
}
/*
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 8ee5972893ed..2f8acb4c556d 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -34,7 +34,7 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *);
static int afs_deliver_yfs_cb_callback(struct afs_call *);
#define CM_NAME(name) \
- const char afs_SRXCB##name##_name[] __tracepoint_string = \
+ char afs_SRXCB##name##_name[] __tracepoint_string = \
"CB." #name
/*
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index ca08c83168f5..0b37867b5c20 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -1515,8 +1515,8 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr)
xdr_encode_AFS_StoreStatus(&bp, attr);
- *bp++ = 0; /* position of start of write */
- *bp++ = 0;
+ *bp++ = htonl(attr->ia_size >> 32); /* position of start of write */
+ *bp++ = htonl((u32) attr->ia_size);
*bp++ = 0; /* size of write */
*bp++ = 0;
*bp++ = htonl(attr->ia_size >> 32); /* new file length */
@@ -1564,7 +1564,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
xdr_encode_AFS_StoreStatus(&bp, attr);
- *bp++ = 0; /* position of start of write */
+ *bp++ = htonl(attr->ia_size); /* position of start of write */
*bp++ = 0; /* size of write */
*bp++ = htonl(attr->ia_size); /* new file length */
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 1a4ce07fb406..9cedc3fc1b77 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -216,9 +216,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
set_nlink(inode, 2);
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
- inode->i_ctime.tv_sec = get_seconds();
- inode->i_ctime.tv_nsec = 0;
- inode->i_atime = inode->i_mtime = inode->i_ctime;
+ inode->i_ctime = inode->i_atime = inode->i_mtime = current_time(inode);
inode->i_blocks = 0;
inode_set_iversion_raw(inode, 0);
inode->i_generation = 0;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 8871b9e8645f..3904ab0b9563 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -36,15 +36,14 @@
struct pagevec;
struct afs_call;
-struct afs_mount_params {
- bool rwpath; /* T if the parent should be considered R/W */
+struct afs_fs_context {
bool force; /* T to force cell type */
bool autocell; /* T if set auto mount operation */
bool dyn_root; /* T if dynamic root */
+ bool no_cell; /* T if the source is "none" (for dynroot) */
afs_voltype_t type; /* type of volume requested */
- int volnamesz; /* size of volume name */
+ unsigned int volnamesz; /* size of volume name */
const char *volname; /* name of volume to mount */
- struct net *net_ns; /* Network namespace in effect */
struct afs_net *net; /* the AFS net namespace stuff */
struct afs_cell *cell; /* cell in which to find volume */
struct afs_volume *volume; /* volume record */
@@ -475,7 +474,6 @@ struct afs_server {
time64_t put_time; /* Time at which last put */
time64_t update_at; /* Time at which to next update the record */
unsigned long flags;
-#define AFS_SERVER_FL_NEW 0 /* New server, don't inc cb_s_break */
#define AFS_SERVER_FL_NOT_READY 1 /* The record is not ready for use */
#define AFS_SERVER_FL_NOT_FOUND 2 /* VL server says no such server */
#define AFS_SERVER_FL_VL_FAIL 3 /* Failed to access VL server */
@@ -828,7 +826,7 @@ static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest
static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode)
{
- return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break;
+ return vnode->cb_break + vnode->cb_v_break;
}
static inline bool afs_cb_is_broken(unsigned int cb_break,
@@ -836,7 +834,6 @@ static inline bool afs_cb_is_broken(unsigned int cb_break,
const struct afs_cb_interest *cbi)
{
return !cbi || cb_break != (vnode->cb_break +
- cbi->server->cb_s_break +
vnode->volume->cb_v_break);
}
@@ -1274,7 +1271,7 @@ static inline struct afs_volume *__afs_get_volume(struct afs_volume *volume)
return volume;
}
-extern struct afs_volume *afs_create_volume(struct afs_mount_params *);
+extern struct afs_volume *afs_create_volume(struct afs_fs_context *);
extern void afs_activate_volume(struct afs_volume *);
extern void afs_deactivate_volume(struct afs_volume *);
extern void afs_put_volume(struct afs_cell *, struct afs_volume *);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 2e51c6994148..eecd8b699186 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -17,6 +17,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/gfp.h>
+#include <linux/fs_context.h>
#include "internal.h"
@@ -47,6 +48,8 @@ static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
+static const char afs_root_volume[] = "root.cell";
+
/*
* no valid lookup procedure on this sort of dir
*/
@@ -68,108 +71,112 @@ static int afs_mntpt_open(struct inode *inode, struct file *file)
}
/*
- * create a vfsmount to be automounted
+ * Set the parameters for the proposed superblock.
*/
-static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
+static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
{
- struct afs_super_info *as;
- struct vfsmount *mnt;
- struct afs_vnode *vnode;
- struct page *page;
- char *devname, *options;
- bool rwpath = false;
+ struct afs_fs_context *ctx = fc->fs_private;
+ struct afs_super_info *src_as = AFS_FS_S(mntpt->d_sb);
+ struct afs_vnode *vnode = AFS_FS_I(d_inode(mntpt));
+ struct afs_cell *cell;
+ const char *p;
int ret;
- _enter("{%pd}", mntpt);
-
- BUG_ON(!d_inode(mntpt));
-
- ret = -ENOMEM;
- devname = (char *) get_zeroed_page(GFP_KERNEL);
- if (!devname)
- goto error_no_devname;
-
- options = (char *) get_zeroed_page(GFP_KERNEL);
- if (!options)
- goto error_no_options;
+ if (fc->net_ns != src_as->net_ns) {
+ put_net(fc->net_ns);
+ fc->net_ns = get_net(src_as->net_ns);
+ }
- vnode = AFS_FS_I(d_inode(mntpt));
+ if (src_as->volume && src_as->volume->type == AFSVL_RWVOL) {
+ ctx->type = AFSVL_RWVOL;
+ ctx->force = true;
+ }
+ if (ctx->cell) {
+ afs_put_cell(ctx->net, ctx->cell);
+ ctx->cell = NULL;
+ }
if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
/* if the directory is a pseudo directory, use the d_name */
- static const char afs_root_cell[] = ":root.cell.";
unsigned size = mntpt->d_name.len;
- ret = -ENOENT;
- if (size < 2 || size > AFS_MAXCELLNAME)
- goto error_no_page;
+ if (size < 2)
+ return -ENOENT;
+ p = mntpt->d_name.name;
if (mntpt->d_name.name[0] == '.') {
- devname[0] = '%';
- memcpy(devname + 1, mntpt->d_name.name + 1, size - 1);
- memcpy(devname + size, afs_root_cell,
- sizeof(afs_root_cell));
- rwpath = true;
- } else {
- devname[0] = '#';
- memcpy(devname + 1, mntpt->d_name.name, size);
- memcpy(devname + size + 1, afs_root_cell,
- sizeof(afs_root_cell));
+ size--;
+ p++;
+ ctx->type = AFSVL_RWVOL;
+ ctx->force = true;
}
+ if (size > AFS_MAXCELLNAME)
+ return -ENAMETOOLONG;
+
+ cell = afs_lookup_cell(ctx->net, p, size, NULL, false);
+ if (IS_ERR(cell)) {
+ pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt);
+ return PTR_ERR(cell);
+ }
+ ctx->cell = cell;
+
+ ctx->volname = afs_root_volume;
+ ctx->volnamesz = sizeof(afs_root_volume) - 1;
} else {
/* read the contents of the AFS special symlink */
+ struct page *page;
loff_t size = i_size_read(d_inode(mntpt));
char *buf;
- ret = -EINVAL;
+ if (src_as->cell)
+ ctx->cell = afs_get_cell(src_as->cell);
+
if (size > PAGE_SIZE - 1)
- goto error_no_page;
+ return -EINVAL;
page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL);
- if (IS_ERR(page)) {
- ret = PTR_ERR(page);
- goto error_no_page;
- }
+ if (IS_ERR(page))
+ return PTR_ERR(page);
if (PageError(page)) {
ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt);
- goto error;
+ put_page(page);
+ return ret;
}
- buf = kmap_atomic(page);
- memcpy(devname, buf, size);
- kunmap_atomic(buf);
+ buf = kmap(page);
+ ret = vfs_parse_fs_string(fc, "source", buf, size);
+ kunmap(page);
put_page(page);
- page = NULL;
+ if (ret < 0)
+ return ret;
}
- /* work out what options we want */
- as = AFS_FS_S(mntpt->d_sb);
- if (as->cell) {
- memcpy(options, "cell=", 5);
- strcpy(options + 5, as->cell->name);
- if ((as->volume && as->volume->type == AFSVL_RWVOL) || rwpath)
- strcat(options, ",rwpath");
- }
+ return 0;
+}
- /* try and do the mount */
- _debug("--- attempting mount %s -o %s ---", devname, options);
- mnt = vfs_submount(mntpt, &afs_fs_type, devname, options);
- _debug("--- mount result %p ---", mnt);
+/*
+ * create a vfsmount to be automounted
+ */
+static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
+{
+ struct fs_context *fc;
+ struct vfsmount *mnt;
+ int ret;
- free_page((unsigned long) devname);
- free_page((unsigned long) options);
- _leave(" = %p", mnt);
- return mnt;
+ BUG_ON(!d_inode(mntpt));
-error:
- put_page(page);
-error_no_page:
- free_page((unsigned long) options);
-error_no_options:
- free_page((unsigned long) devname);
-error_no_devname:
- _leave(" = %d", ret);
- return ERR_PTR(ret);
+ fc = fs_context_for_submount(&afs_fs_type, mntpt);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ ret = afs_mntpt_set_params(fc, mntpt);
+ if (!ret)
+ mnt = fc_mount(fc);
+ else
+ mnt = ERR_PTR(ret);
+
+ put_fs_context(fc);
+ return mnt;
}
/*
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 2c588f9bbbda..15c7e82d80cb 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -572,13 +572,17 @@ static void afs_deliver_to_call(struct afs_call *call)
case -ENODATA:
case -EBADMSG:
case -EMSGSIZE:
- default:
abort_code = RXGEN_CC_UNMARSHAL;
if (state != AFS_CALL_CL_AWAIT_REPLY)
abort_code = RXGEN_SS_UNMARSHAL;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
abort_code, ret, "KUM");
goto local_abort;
+ default:
+ abort_code = RX_USER_ABORT;
+ rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
+ abort_code, ret, "KER");
+ goto local_abort;
}
}
@@ -610,6 +614,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call,
bool stalled = false;
u64 rtt;
u32 life, last_life;
+ bool rxrpc_complete = false;
DECLARE_WAITQUEUE(myself, current);
@@ -621,7 +626,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call,
rtt2 = 2;
timeout = rtt2;
- last_life = rxrpc_kernel_check_life(call->net->socket, call->rxcall);
+ rxrpc_kernel_check_life(call->net->socket, call->rxcall, &last_life);
add_wait_queue(&call->waitq, &myself);
for (;;) {
@@ -639,7 +644,12 @@ static long afs_wait_for_call_to_complete(struct afs_call *call,
if (afs_check_call_state(call, AFS_CALL_COMPLETE))
break;
- life = rxrpc_kernel_check_life(call->net->socket, call->rxcall);
+ if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall, &life)) {
+ /* rxrpc terminated the call. */
+ rxrpc_complete = true;
+ break;
+ }
+
if (timeout == 0 &&
life == last_life && signal_pending(current)) {
if (stalled)
@@ -663,12 +673,16 @@ static long afs_wait_for_call_to_complete(struct afs_call *call,
remove_wait_queue(&call->waitq, &myself);
__set_current_state(TASK_RUNNING);
- /* Kill off the call if it's still live. */
if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) {
- _debug("call interrupted");
- if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- RX_USER_ABORT, -EINTR, "KWI"))
- afs_set_call_complete(call, -EINTR, 0);
+ if (rxrpc_complete) {
+ afs_set_call_complete(call, call->error, call->abort_code);
+ } else {
+ /* Kill off the call if it's still live. */
+ _debug("call interrupted");
+ if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
+ RX_USER_ABORT, -EINTR, "KWI"))
+ afs_set_call_complete(call, -EINTR, 0);
+ }
}
spin_lock_bh(&call->state_lock);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 642afa2e9783..65b33b6da48b 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -226,7 +226,6 @@ static struct afs_server *afs_alloc_server(struct afs_net *net,
RCU_INIT_POINTER(server->addresses, alist);
server->addr_version = alist->version;
server->uuid = *uuid;
- server->flags = (1UL << AFS_SERVER_FL_NEW);
server->update_at = ktime_get_real_seconds() + afs_server_update_delay;
rwlock_init(&server->fs_lock);
INIT_HLIST_HEAD(&server->cb_volumes);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index dcd07fe99871..5adf012b8e27 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -1,6 +1,6 @@
/* AFS superblock handling
*
- * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
+ * Copyright (c) 2002, 2007, 2018 Red Hat, Inc. All rights reserved.
*
* This software may be freely redistributed under the terms of the
* GNU General Public License.
@@ -21,7 +21,7 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
#include <linux/statfs.h>
#include <linux/sched.h>
#include <linux/nsproxy.h>
@@ -30,21 +30,22 @@
#include "internal.h"
static void afs_i_init_once(void *foo);
-static struct dentry *afs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data);
static void afs_kill_super(struct super_block *sb);
static struct inode *afs_alloc_inode(struct super_block *sb);
static void afs_destroy_inode(struct inode *inode);
static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
static int afs_show_devname(struct seq_file *m, struct dentry *root);
static int afs_show_options(struct seq_file *m, struct dentry *root);
+static int afs_init_fs_context(struct fs_context *fc);
+static const struct fs_parameter_description afs_fs_parameters;
struct file_system_type afs_fs_type = {
- .owner = THIS_MODULE,
- .name = "afs",
- .mount = afs_mount,
- .kill_sb = afs_kill_super,
- .fs_flags = 0,
+ .owner = THIS_MODULE,
+ .name = "afs",
+ .init_fs_context = afs_init_fs_context,
+ .parameters = &afs_fs_parameters,
+ .kill_sb = afs_kill_super,
+ .fs_flags = 0,
};
MODULE_ALIAS_FS("afs");
@@ -63,22 +64,22 @@ static const struct super_operations afs_super_ops = {
static struct kmem_cache *afs_inode_cachep;
static atomic_t afs_count_active_inodes;
-enum {
- afs_no_opt,
- afs_opt_cell,
- afs_opt_dyn,
- afs_opt_rwpath,
- afs_opt_vol,
- afs_opt_autocell,
+enum afs_param {
+ Opt_autocell,
+ Opt_dyn,
+ Opt_source,
};
-static const match_table_t afs_options_list = {
- { afs_opt_cell, "cell=%s" },
- { afs_opt_dyn, "dyn" },
- { afs_opt_rwpath, "rwpath" },
- { afs_opt_vol, "vol=%s" },
- { afs_opt_autocell, "autocell" },
- { afs_no_opt, NULL },
+static const struct fs_parameter_spec afs_param_specs[] = {
+ fsparam_flag ("autocell", Opt_autocell),
+ fsparam_flag ("dyn", Opt_dyn),
+ fsparam_string("source", Opt_source),
+ {}
+};
+
+static const struct fs_parameter_description afs_fs_parameters = {
+ .name = "kAFS",
+ .specs = afs_param_specs,
};
/*
@@ -190,84 +191,23 @@ static int afs_show_options(struct seq_file *m, struct dentry *root)
}
/*
- * parse the mount options
- * - this function has been shamelessly adapted from the ext3 fs which
- * shamelessly adapted it from the msdos fs
- */
-static int afs_parse_options(struct afs_mount_params *params,
- char *options, const char **devname)
-{
- struct afs_cell *cell;
- substring_t args[MAX_OPT_ARGS];
- char *p;
- int token;
-
- _enter("%s", options);
-
- options[PAGE_SIZE - 1] = 0;
-
- while ((p = strsep(&options, ","))) {
- if (!*p)
- continue;
-
- token = match_token(p, afs_options_list, args);
- switch (token) {
- case afs_opt_cell:
- rcu_read_lock();
- cell = afs_lookup_cell_rcu(params->net,
- args[0].from,
- args[0].to - args[0].from);
- rcu_read_unlock();
- if (IS_ERR(cell))
- return PTR_ERR(cell);
- afs_put_cell(params->net, params->cell);
- params->cell = cell;
- break;
-
- case afs_opt_rwpath:
- params->rwpath = true;
- break;
-
- case afs_opt_vol:
- *devname = args[0].from;
- break;
-
- case afs_opt_autocell:
- params->autocell = true;
- break;
-
- case afs_opt_dyn:
- params->dyn_root = true;
- break;
-
- default:
- printk(KERN_ERR "kAFS:"
- " Unknown or invalid mount option: '%s'\n", p);
- return -EINVAL;
- }
- }
-
- _leave(" = 0");
- return 0;
-}
-
-/*
- * parse a device name to get cell name, volume name, volume type and R/W
- * selector
- * - this can be one of the following:
+ * Parse the source name to get cell name, volume name, volume type and R/W
+ * selector.
+ *
+ * This can be one of the following:
* "%[cell:]volume[.]" R/W volume
- * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0),
- * or R/W (rwpath=1) volume
+ * "#[cell:]volume[.]" R/O or R/W volume (R/O parent),
+ * or R/W (R/W parent) volume
* "%[cell:]volume.readonly" R/O volume
* "#[cell:]volume.readonly" R/O volume
* "%[cell:]volume.backup" Backup volume
* "#[cell:]volume.backup" Backup volume
*/
-static int afs_parse_device_name(struct afs_mount_params *params,
- const char *name)
+static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param)
{
+ struct afs_fs_context *ctx = fc->fs_private;
struct afs_cell *cell;
- const char *cellname, *suffix;
+ const char *cellname, *suffix, *name = param->string;
int cellnamesz;
_enter(",%s", name);
@@ -278,69 +218,149 @@ static int afs_parse_device_name(struct afs_mount_params *params,
}
if ((name[0] != '%' && name[0] != '#') || !name[1]) {
+ /* To use dynroot, we don't want to have to provide a source */
+ if (strcmp(name, "none") == 0) {
+ ctx->no_cell = true;
+ return 0;
+ }
printk(KERN_ERR "kAFS: unparsable volume name\n");
return -EINVAL;
}
/* determine the type of volume we're looking for */
- params->type = AFSVL_ROVOL;
- params->force = false;
- if (params->rwpath || name[0] == '%') {
- params->type = AFSVL_RWVOL;
- params->force = true;
+ if (name[0] == '%') {
+ ctx->type = AFSVL_RWVOL;
+ ctx->force = true;
}
name++;
/* split the cell name out if there is one */
- params->volname = strchr(name, ':');
- if (params->volname) {
+ ctx->volname = strchr(name, ':');
+ if (ctx->volname) {
cellname = name;
- cellnamesz = params->volname - name;
- params->volname++;
+ cellnamesz = ctx->volname - name;
+ ctx->volname++;
} else {
- params->volname = name;
+ ctx->volname = name;
cellname = NULL;
cellnamesz = 0;
}
/* the volume type is further affected by a possible suffix */
- suffix = strrchr(params->volname, '.');
+ suffix = strrchr(ctx->volname, '.');
if (suffix) {
if (strcmp(suffix, ".readonly") == 0) {
- params->type = AFSVL_ROVOL;
- params->force = true;
+ ctx->type = AFSVL_ROVOL;
+ ctx->force = true;
} else if (strcmp(suffix, ".backup") == 0) {
- params->type = AFSVL_BACKVOL;
- params->force = true;
+ ctx->type = AFSVL_BACKVOL;
+ ctx->force = true;
} else if (suffix[1] == 0) {
} else {
suffix = NULL;
}
}
- params->volnamesz = suffix ?
- suffix - params->volname : strlen(params->volname);
+ ctx->volnamesz = suffix ?
+ suffix - ctx->volname : strlen(ctx->volname);
_debug("cell %*.*s [%p]",
- cellnamesz, cellnamesz, cellname ?: "", params->cell);
+ cellnamesz, cellnamesz, cellname ?: "", ctx->cell);
/* lookup the cell record */
- if (cellname || !params->cell) {
- cell = afs_lookup_cell(params->net, cellname, cellnamesz,
+ if (cellname) {
+ cell = afs_lookup_cell(ctx->net, cellname, cellnamesz,
NULL, false);
if (IS_ERR(cell)) {
- printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
+ pr_err("kAFS: unable to lookup cell '%*.*s'\n",
cellnamesz, cellnamesz, cellname ?: "");
return PTR_ERR(cell);
}
- afs_put_cell(params->net, params->cell);
- params->cell = cell;
+ afs_put_cell(ctx->net, ctx->cell);
+ ctx->cell = cell;
}
_debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
- params->cell->name, params->cell,
- params->volnamesz, params->volnamesz, params->volname,
- suffix ?: "-", params->type, params->force ? " FORCE" : "");
+ ctx->cell->name, ctx->cell,
+ ctx->volnamesz, ctx->volnamesz, ctx->volname,
+ suffix ?: "-", ctx->type, ctx->force ? " FORCE" : "");
+
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
+}
+
+/*
+ * Parse a single mount parameter.
+ */
+static int afs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct fs_parse_result result;
+ struct afs_fs_context *ctx = fc->fs_private;
+ int opt;
+
+ opt = fs_parse(fc, &afs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_source:
+ return afs_parse_source(fc, param);
+
+ case Opt_autocell:
+ ctx->autocell = true;
+ break;
+
+ case Opt_dyn:
+ ctx->dyn_root = true;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * Validate the options, get the cell key and look up the volume.
+ */
+static int afs_validate_fc(struct fs_context *fc)
+{
+ struct afs_fs_context *ctx = fc->fs_private;
+ struct afs_volume *volume;
+ struct key *key;
+
+ if (!ctx->dyn_root) {
+ if (ctx->no_cell) {
+ pr_warn("kAFS: Can only specify source 'none' with -o dyn\n");
+ return -EINVAL;
+ }
+
+ if (!ctx->cell) {
+ pr_warn("kAFS: No cell specified\n");
+ return -EDESTADDRREQ;
+ }
+
+ /* We try to do the mount securely. */
+ key = afs_request_key(ctx->cell);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ ctx->key = key;
+
+ if (ctx->volume) {
+ afs_put_volume(ctx->cell, ctx->volume);
+ ctx->volume = NULL;
+ }
+
+ volume = afs_create_volume(ctx);
+ if (IS_ERR(volume))
+ return PTR_ERR(volume);
+
+ ctx->volume = volume;
+ }
return 0;
}
@@ -348,39 +368,34 @@ static int afs_parse_device_name(struct afs_mount_params *params,
/*
* check a superblock to see if it's the one we're looking for
*/
-static int afs_test_super(struct super_block *sb, void *data)
+static int afs_test_super(struct super_block *sb, struct fs_context *fc)
{
- struct afs_super_info *as1 = data;
+ struct afs_fs_context *ctx = fc->fs_private;
struct afs_super_info *as = AFS_FS_S(sb);
- return (as->net_ns == as1->net_ns &&
+ return (as->net_ns == fc->net_ns &&
as->volume &&
- as->volume->vid == as1->volume->vid &&
+ as->volume->vid == ctx->volume->vid &&
!as->dyn_root);
}
-static int afs_dynroot_test_super(struct super_block *sb, void *data)
+static int afs_dynroot_test_super(struct super_block *sb, struct fs_context *fc)
{
- struct afs_super_info *as1 = data;
struct afs_super_info *as = AFS_FS_S(sb);
- return (as->net_ns == as1->net_ns &&
+ return (as->net_ns == fc->net_ns &&
as->dyn_root);
}
-static int afs_set_super(struct super_block *sb, void *data)
+static int afs_set_super(struct super_block *sb, struct fs_context *fc)
{
- struct afs_super_info *as = data;
-
- sb->s_fs_info = as;
return set_anon_super(sb, NULL);
}
/*
* fill in the superblock
*/
-static int afs_fill_super(struct super_block *sb,
- struct afs_mount_params *params)
+static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
{
struct afs_super_info *as = AFS_FS_S(sb);
struct afs_fid fid;
@@ -399,7 +414,7 @@ static int afs_fill_super(struct super_block *sb,
ret = super_setup_bdi(sb);
if (ret)
return ret;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* allocate the root inode and dentry */
if (as->dyn_root) {
@@ -412,13 +427,13 @@ static int afs_fill_super(struct super_block *sb,
fid.vnode = 1;
fid.vnode_hi = 0;
fid.unique = 1;
- inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL);
+ inode = afs_iget(sb, ctx->key, &fid, NULL, NULL, NULL);
}
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (params->autocell || params->dyn_root)
+ if (ctx->autocell || as->dyn_root)
set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
ret = -ENOMEM;
@@ -443,17 +458,20 @@ error:
return ret;
}
-static struct afs_super_info *afs_alloc_sbi(struct afs_mount_params *params)
+static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc)
{
+ struct afs_fs_context *ctx = fc->fs_private;
struct afs_super_info *as;
as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
if (as) {
- as->net_ns = get_net(params->net_ns);
- if (params->dyn_root)
+ as->net_ns = get_net(fc->net_ns);
+ if (ctx->dyn_root) {
as->dyn_root = true;
- else
- as->cell = afs_get_cell(params->cell);
+ } else {
+ as->cell = afs_get_cell(ctx->cell);
+ as->volume = __afs_get_volume(ctx->volume);
+ }
}
return as;
}
@@ -475,7 +493,7 @@ static void afs_kill_super(struct super_block *sb)
if (as->dyn_root)
afs_dynroot_depopulate(sb);
-
+
/* Clear the callback interests (which will do ilookup5) before
* deactivating the superblock.
*/
@@ -488,111 +506,103 @@ static void afs_kill_super(struct super_block *sb)
}
/*
- * get an AFS superblock
+ * Get an AFS superblock and root directory.
*/
-static struct dentry *afs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *options)
+static int afs_get_tree(struct fs_context *fc)
{
- struct afs_mount_params params;
+ struct afs_fs_context *ctx = fc->fs_private;
struct super_block *sb;
- struct afs_volume *candidate;
- struct key *key;
struct afs_super_info *as;
int ret;
- _enter(",,%s,%p", dev_name, options);
-
- memset(&params, 0, sizeof(params));
-
- ret = -EINVAL;
- if (current->nsproxy->net_ns != &init_net)
+ ret = afs_validate_fc(fc);
+ if (ret)
goto error;
- params.net_ns = current->nsproxy->net_ns;
- params.net = afs_net(params.net_ns);
-
- /* parse the options and device name */
- if (options) {
- ret = afs_parse_options(&params, options, &dev_name);
- if (ret < 0)
- goto error;
- }
-
- if (!params.dyn_root) {
- ret = afs_parse_device_name(&params, dev_name);
- if (ret < 0)
- goto error;
- /* try and do the mount securely */
- key = afs_request_key(params.cell);
- if (IS_ERR(key)) {
- _leave(" = %ld [key]", PTR_ERR(key));
- ret = PTR_ERR(key);
- goto error;
- }
- params.key = key;
- }
+ _enter("");
/* allocate a superblock info record */
ret = -ENOMEM;
- as = afs_alloc_sbi(&params);
+ as = afs_alloc_sbi(fc);
if (!as)
- goto error_key;
-
- if (!params.dyn_root) {
- /* Assume we're going to need a volume record; at the very
- * least we can use it to update the volume record if we have
- * one already. This checks that the volume exists within the
- * cell.
- */
- candidate = afs_create_volume(&params);
- if (IS_ERR(candidate)) {
- ret = PTR_ERR(candidate);
- goto error_as;
- }
-
- as->volume = candidate;
- }
+ goto error;
+ fc->s_fs_info = as;
/* allocate a deviceless superblock */
- sb = sget(fs_type,
- as->dyn_root ? afs_dynroot_test_super : afs_test_super,
- afs_set_super, flags, as);
+ sb = sget_fc(fc,
+ as->dyn_root ? afs_dynroot_test_super : afs_test_super,
+ afs_set_super);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
- goto error_as;
+ goto error;
}
if (!sb->s_root) {
/* initial superblock/root creation */
_debug("create");
- ret = afs_fill_super(sb, &params);
+ ret = afs_fill_super(sb, ctx);
if (ret < 0)
goto error_sb;
- as = NULL;
sb->s_flags |= SB_ACTIVE;
} else {
_debug("reuse");
ASSERTCMP(sb->s_flags, &, SB_ACTIVE);
- afs_destroy_sbi(as);
- as = NULL;
}
- afs_put_cell(params.net, params.cell);
- key_put(params.key);
+ fc->root = dget(sb->s_root);
_leave(" = 0 [%p]", sb);
- return dget(sb->s_root);
+ return 0;
error_sb:
deactivate_locked_super(sb);
- goto error_key;
-error_as:
- afs_destroy_sbi(as);
-error_key:
- key_put(params.key);
error:
- afs_put_cell(params.net, params.cell);
_leave(" = %d", ret);
- return ERR_PTR(ret);
+ return ret;
+}
+
+static void afs_free_fc(struct fs_context *fc)
+{
+ struct afs_fs_context *ctx = fc->fs_private;
+
+ afs_destroy_sbi(fc->s_fs_info);
+ afs_put_volume(ctx->cell, ctx->volume);
+ afs_put_cell(ctx->net, ctx->cell);
+ key_put(ctx->key);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations afs_context_ops = {
+ .free = afs_free_fc,
+ .parse_param = afs_parse_param,
+ .get_tree = afs_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int afs_init_fs_context(struct fs_context *fc)
+{
+ struct afs_fs_context *ctx;
+ struct afs_cell *cell;
+
+ ctx = kzalloc(sizeof(struct afs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->type = AFSVL_ROVOL;
+ ctx->net = afs_net(fc->net_ns);
+
+ /* Default to the workstation cell. */
+ rcu_read_lock();
+ cell = afs_lookup_cell_rcu(ctx->net, NULL, 0);
+ rcu_read_unlock();
+ if (IS_ERR(cell))
+ cell = NULL;
+ ctx->cell = cell;
+
+ fc->fs_private = ctx;
+ fc->ops = &afs_context_ops;
+ return 0;
}
/*
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 00975ed3640f..f6eba2def0a1 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -21,7 +21,7 @@ static const char *const afs_voltypes[] = { "R/W", "R/O", "BAK" };
/*
* Allocate a volume record and load it up from a vldb record.
*/
-static struct afs_volume *afs_alloc_volume(struct afs_mount_params *params,
+static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
struct afs_vldb_entry *vldb,
unsigned long type_mask)
{
@@ -113,7 +113,7 @@ static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell,
* - Rule 3: If parent volume is R/W, then only mount R/W volume unless
* explicitly told otherwise
*/
-struct afs_volume *afs_create_volume(struct afs_mount_params *params)
+struct afs_volume *afs_create_volume(struct afs_fs_context *params)
{
struct afs_vldb_entry *vldb;
struct afs_volume *volume;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 72efcfcf9f95..0122d7445fba 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -264,6 +264,7 @@ static void afs_kill_pages(struct address_space *mapping,
first = page->index + 1;
lock_page(page);
generic_error_remove_page(mapping, page);
+ unlock_page(page);
}
__pagevec_release(&pv);
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 5aa57929e8c2..6e97a42d24d1 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -1514,7 +1514,7 @@ static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &vnode->fid);
bp = xdr_encode_YFS_StoreStatus(bp, attr);
- bp = xdr_encode_u64(bp, 0); /* position of start of write */
+ bp = xdr_encode_u64(bp, attr->ia_size); /* position of start of write */
bp = xdr_encode_u64(bp, 0); /* size of write */
bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */
yfs_check_req(call, bp);
diff --git a/fs/aio.c b/fs/aio.c
index 38b741aef0bf..3490d1fa0e16 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -181,7 +181,7 @@ struct poll_iocb {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
- bool woken;
+ bool done;
bool cancelled;
struct wait_queue_entry wait;
struct work_struct work;
@@ -204,8 +204,7 @@ struct aio_kiocb {
struct kioctx *ki_ctx;
kiocb_cancel_fn *ki_cancel;
- struct iocb __user *ki_user_iocb; /* user's aiocb */
- __u64 ki_user_data; /* user's data for completion */
+ struct io_event ki_res;
struct list_head ki_list; /* the aio core uses this
* for cancellation */
@@ -1022,6 +1021,9 @@ static bool get_reqs_available(struct kioctx *ctx)
/* aio_get_req
* Allocate a slot for an aio request.
* Returns NULL if no requests are free.
+ *
+ * The refcount is initialized to 2 - one for the async op completion,
+ * one for the synchronous code that does this.
*/
static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
{
@@ -1031,10 +1033,15 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
if (unlikely(!req))
return NULL;
+ if (unlikely(!get_reqs_available(ctx))) {
+ kmem_cache_free(kiocb_cachep, req);
+ return NULL;
+ }
+
percpu_ref_get(&ctx->reqs);
req->ki_ctx = ctx;
INIT_LIST_HEAD(&req->ki_list);
- refcount_set(&req->ki_refcnt, 0);
+ refcount_set(&req->ki_refcnt, 2);
req->ki_eventfd = NULL;
return req;
}
@@ -1067,30 +1074,20 @@ out:
return ret;
}
-static inline void iocb_put(struct aio_kiocb *iocb)
-{
- if (refcount_read(&iocb->ki_refcnt) == 0 ||
- refcount_dec_and_test(&iocb->ki_refcnt)) {
- if (iocb->ki_filp)
- fput(iocb->ki_filp);
- percpu_ref_put(&iocb->ki_ctx->reqs);
- kmem_cache_free(kiocb_cachep, iocb);
- }
-}
-
-static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb,
- long res, long res2)
+static inline void iocb_destroy(struct aio_kiocb *iocb)
{
- ev->obj = (u64)(unsigned long)iocb->ki_user_iocb;
- ev->data = iocb->ki_user_data;
- ev->res = res;
- ev->res2 = res2;
+ if (iocb->ki_eventfd)
+ eventfd_ctx_put(iocb->ki_eventfd);
+ if (iocb->ki_filp)
+ fput(iocb->ki_filp);
+ percpu_ref_put(&iocb->ki_ctx->reqs);
+ kmem_cache_free(kiocb_cachep, iocb);
}
/* aio_complete
* Called when the io request on the given iocb is complete.
*/
-static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
+static void aio_complete(struct aio_kiocb *iocb)
{
struct kioctx *ctx = iocb->ki_ctx;
struct aio_ring *ring;
@@ -1114,14 +1111,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
event = ev_page + pos % AIO_EVENTS_PER_PAGE;
- aio_fill_event(event, iocb, res, res2);
+ *event = iocb->ki_res;
kunmap_atomic(ev_page);
flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
- pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
- ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
- res, res2);
+ pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
+ (void __user *)(unsigned long)iocb->ki_res.obj,
+ iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2);
/* after flagging the request as done, we
* must never even look at it again
@@ -1148,10 +1145,8 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
* eventfd. The eventfd_signal() function is safe to be called
* from IRQ context.
*/
- if (iocb->ki_eventfd) {
+ if (iocb->ki_eventfd)
eventfd_signal(iocb->ki_eventfd, 1);
- eventfd_ctx_put(iocb->ki_eventfd);
- }
/*
* We have to order our ring_info tail store above and test
@@ -1163,7 +1158,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
- iocb_put(iocb);
+}
+
+static inline void iocb_put(struct aio_kiocb *iocb)
+{
+ if (refcount_dec_and_test(&iocb->ki_refcnt)) {
+ aio_complete(iocb);
+ iocb_destroy(iocb);
+ }
}
/* aio_read_events_ring
@@ -1437,7 +1439,9 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
file_end_write(kiocb->ki_filp);
}
- aio_complete(iocb, res, res2);
+ iocb->ki_res.res = res;
+ iocb->ki_res.res2 = res2;
+ iocb_put(iocb);
}
static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
@@ -1514,13 +1518,13 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
}
}
-static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
+static int aio_read(struct kiocb *req, const struct iocb *iocb,
bool vectored, bool compat)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
struct file *file;
- ssize_t ret;
+ int ret;
ret = aio_prep_rw(req, iocb);
if (ret)
@@ -1542,13 +1546,13 @@ static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
return ret;
}
-static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
+static int aio_write(struct kiocb *req, const struct iocb *iocb,
bool vectored, bool compat)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
struct file *file;
- ssize_t ret;
+ int ret;
ret = aio_prep_rw(req, iocb);
if (ret)
@@ -1585,11 +1589,10 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
static void aio_fsync_work(struct work_struct *work)
{
- struct fsync_iocb *req = container_of(work, struct fsync_iocb, work);
- int ret;
+ struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
- ret = vfs_fsync(req->file, req->datasync);
- aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
+ iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
+ iocb_put(iocb);
}
static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
@@ -1608,11 +1611,6 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
return 0;
}
-static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
-{
- aio_complete(iocb, mangle_poll(mask), 0);
-}
-
static void aio_poll_complete_work(struct work_struct *work)
{
struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1638,9 +1636,11 @@ static void aio_poll_complete_work(struct work_struct *work)
return;
}
list_del_init(&iocb->ki_list);
+ iocb->ki_res.res = mangle_poll(mask);
+ req->done = true;
spin_unlock_irq(&ctx->ctx_lock);
- aio_poll_complete(iocb, mask);
+ iocb_put(iocb);
}
/* assumes we are called with irqs disabled */
@@ -1668,31 +1668,27 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
__poll_t mask = key_to_poll(key);
unsigned long flags;
- req->woken = true;
-
/* for instances that support it check for an event match first: */
- if (mask) {
- if (!(mask & req->events))
- return 0;
+ if (mask && !(mask & req->events))
+ return 0;
+
+ list_del_init(&req->wait.entry);
+ if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
/*
* Try to complete the iocb inline if we can. Use
* irqsave/irqrestore because not all filesystems (e.g. fuse)
* call this function with IRQs disabled and because IRQs
* have to be disabled before ctx_lock is obtained.
*/
- if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
- list_del(&iocb->ki_list);
- spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
-
- list_del_init(&req->wait.entry);
- aio_poll_complete(iocb, mask);
- return 1;
- }
+ list_del(&iocb->ki_list);
+ iocb->ki_res.res = mangle_poll(mask);
+ req->done = true;
+ spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
+ iocb_put(iocb);
+ } else {
+ schedule_work(&req->work);
}
-
- list_del_init(&req->wait.entry);
- schedule_work(&req->work);
return 1;
}
@@ -1719,11 +1715,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
add_wait_queue(head, &pt->iocb->poll.wait);
}
-static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
+static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
{
struct kioctx *ctx = aiocb->ki_ctx;
struct poll_iocb *req = &aiocb->poll;
struct aio_poll_table apt;
+ bool cancel = false;
__poll_t mask;
/* reject any unknown events outside the normal event mask. */
@@ -1737,7 +1734,7 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
req->head = NULL;
- req->woken = false;
+ req->done = false;
req->cancelled = false;
apt.pt._qproc = aio_poll_queue_proc;
@@ -1749,156 +1746,135 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
INIT_LIST_HEAD(&req->wait.entry);
init_waitqueue_func_entry(&req->wait, aio_poll_wake);
- /* one for removal from waitqueue, one for this function */
- refcount_set(&aiocb->ki_refcnt, 2);
-
mask = vfs_poll(req->file, &apt.pt) & req->events;
- if (unlikely(!req->head)) {
- /* we did not manage to set up a waitqueue, done */
- goto out;
- }
-
spin_lock_irq(&ctx->ctx_lock);
- spin_lock(&req->head->lock);
- if (req->woken) {
- /* wake_up context handles the rest */
- mask = 0;
+ if (likely(req->head)) {
+ spin_lock(&req->head->lock);
+ if (unlikely(list_empty(&req->wait.entry))) {
+ if (apt.error)
+ cancel = true;
+ apt.error = 0;
+ mask = 0;
+ }
+ if (mask || apt.error) {
+ list_del_init(&req->wait.entry);
+ } else if (cancel) {
+ WRITE_ONCE(req->cancelled, true);
+ } else if (!req->done) { /* actually waiting for an event */
+ list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
+ aiocb->ki_cancel = aio_poll_cancel;
+ }
+ spin_unlock(&req->head->lock);
+ }
+ if (mask) { /* no async, we'd stolen it */
+ aiocb->ki_res.res = mangle_poll(mask);
apt.error = 0;
- } else if (mask || apt.error) {
- /* if we get an error or a mask we are done */
- WARN_ON_ONCE(list_empty(&req->wait.entry));
- list_del_init(&req->wait.entry);
- } else {
- /* actually waiting for an event */
- list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
- aiocb->ki_cancel = aio_poll_cancel;
}
- spin_unlock(&req->head->lock);
spin_unlock_irq(&ctx->ctx_lock);
-
-out:
- if (unlikely(apt.error))
- return apt.error;
-
if (mask)
- aio_poll_complete(aiocb, mask);
- iocb_put(aiocb);
- return 0;
+ iocb_put(aiocb);
+ return apt.error;
}
static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
- struct iocb __user *user_iocb, bool compat)
+ struct iocb __user *user_iocb, struct aio_kiocb *req,
+ bool compat)
{
- struct aio_kiocb *req;
- ssize_t ret;
-
- /* enforce forwards compatibility on users */
- if (unlikely(iocb->aio_reserved2)) {
- pr_debug("EINVAL: reserve field set\n");
- return -EINVAL;
- }
-
- /* prevent overflows */
- if (unlikely(
- (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
- (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
- ((ssize_t)iocb->aio_nbytes < 0)
- )) {
- pr_debug("EINVAL: overflow check\n");
- return -EINVAL;
- }
-
- if (!get_reqs_available(ctx))
- return -EAGAIN;
-
- ret = -EAGAIN;
- req = aio_get_req(ctx);
- if (unlikely(!req))
- goto out_put_reqs_available;
-
req->ki_filp = fget(iocb->aio_fildes);
- ret = -EBADF;
if (unlikely(!req->ki_filp))
- goto out_put_req;
+ return -EBADF;
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
+ struct eventfd_ctx *eventfd;
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
* instance of the file* now. The file descriptor must be
* an eventfd() fd, and will be signaled for each completed
* event using the eventfd_signal() function.
*/
- req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
- if (IS_ERR(req->ki_eventfd)) {
- ret = PTR_ERR(req->ki_eventfd);
- req->ki_eventfd = NULL;
- goto out_put_req;
- }
+ eventfd = eventfd_ctx_fdget(iocb->aio_resfd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ req->ki_eventfd = eventfd;
}
- ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
- if (unlikely(ret)) {
+ if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
pr_debug("EFAULT: aio_key\n");
- goto out_put_req;
+ return -EFAULT;
}
- req->ki_user_iocb = user_iocb;
- req->ki_user_data = iocb->aio_data;
+ req->ki_res.obj = (u64)(unsigned long)user_iocb;
+ req->ki_res.data = iocb->aio_data;
+ req->ki_res.res = 0;
+ req->ki_res.res2 = 0;
switch (iocb->aio_lio_opcode) {
case IOCB_CMD_PREAD:
- ret = aio_read(&req->rw, iocb, false, compat);
- break;
+ return aio_read(&req->rw, iocb, false, compat);
case IOCB_CMD_PWRITE:
- ret = aio_write(&req->rw, iocb, false, compat);
- break;
+ return aio_write(&req->rw, iocb, false, compat);
case IOCB_CMD_PREADV:
- ret = aio_read(&req->rw, iocb, true, compat);
- break;
+ return aio_read(&req->rw, iocb, true, compat);
case IOCB_CMD_PWRITEV:
- ret = aio_write(&req->rw, iocb, true, compat);
- break;
+ return aio_write(&req->rw, iocb, true, compat);
case IOCB_CMD_FSYNC:
- ret = aio_fsync(&req->fsync, iocb, false);
- break;
+ return aio_fsync(&req->fsync, iocb, false);
case IOCB_CMD_FDSYNC:
- ret = aio_fsync(&req->fsync, iocb, true);
- break;
+ return aio_fsync(&req->fsync, iocb, true);
case IOCB_CMD_POLL:
- ret = aio_poll(req, iocb);
- break;
+ return aio_poll(req, iocb);
default:
pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
- ret = -EINVAL;
- break;
+ return -EINVAL;
}
-
- /*
- * If ret is 0, we'd either done aio_complete() ourselves or have
- * arranged for that to be done asynchronously. Anything non-zero
- * means that we need to destroy req ourselves.
- */
- if (ret)
- goto out_put_req;
- return 0;
-out_put_req:
- if (req->ki_eventfd)
- eventfd_ctx_put(req->ki_eventfd);
- iocb_put(req);
-out_put_reqs_available:
- put_reqs_available(ctx, 1);
- return ret;
}
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
bool compat)
{
+ struct aio_kiocb *req;
struct iocb iocb;
+ int err;
if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
return -EFAULT;
- return __io_submit_one(ctx, &iocb, user_iocb, compat);
+ /* enforce forwards compatibility on users */
+ if (unlikely(iocb.aio_reserved2)) {
+ pr_debug("EINVAL: reserve field set\n");
+ return -EINVAL;
+ }
+
+ /* prevent overflows */
+ if (unlikely(
+ (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
+ (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
+ ((ssize_t)iocb.aio_nbytes < 0)
+ )) {
+ pr_debug("EINVAL: overflow check\n");
+ return -EINVAL;
+ }
+
+ req = aio_get_req(ctx);
+ if (unlikely(!req))
+ return -EAGAIN;
+
+ err = __io_submit_one(ctx, &iocb, user_iocb, req, compat);
+
+ /* Done with the synchronous reference */
+ iocb_put(req);
+
+ /*
+ * If err is 0, we'd either done aio_complete() ourselves or have
+ * arranged for that to be done asynchronously. Anything non-zero
+ * means that we need to destroy req ourselves.
+ */
+ if (unlikely(err)) {
+ iocb_destroy(req);
+ put_reqs_available(ctx, 1);
+ }
+ return err;
}
/* sys_io_submit:
@@ -1997,24 +1973,6 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
}
#endif
-/* lookup_kiocb
- * Finds a given iocb for cancellation.
- */
-static struct aio_kiocb *
-lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb)
-{
- struct aio_kiocb *kiocb;
-
- assert_spin_locked(&ctx->ctx_lock);
-
- /* TODO: use a hash or array, this sucks. */
- list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
- if (kiocb->ki_user_iocb == iocb)
- return kiocb;
- }
- return NULL;
-}
-
/* sys_io_cancel:
* Attempts to cancel an iocb previously passed to io_submit. If
* the operation is successfully cancelled, the resulting event is
@@ -2032,6 +1990,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
struct aio_kiocb *kiocb;
int ret = -EINVAL;
u32 key;
+ u64 obj = (u64)(unsigned long)iocb;
if (unlikely(get_user(key, &iocb->aio_key)))
return -EFAULT;
@@ -2043,10 +2002,13 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
return -EINVAL;
spin_lock_irq(&ctx->ctx_lock);
- kiocb = lookup_kiocb(ctx, iocb);
- if (kiocb) {
- ret = kiocb->ki_cancel(&kiocb->rw);
- list_del_init(&kiocb->ki_list);
+ /* TODO: use a hash or array, this sucks. */
+ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
+ if (kiocb->ki_res.obj == obj) {
+ ret = kiocb->ki_cancel(&kiocb->rw);
+ list_del_init(&kiocb->ki_list);
+ break;
+ }
}
spin_unlock_irq(&ctx->ctx_lock);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index e9faa52bb489..bb28e2ead679 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -264,7 +264,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
bio_for_each_segment_all(bvec, &bio, i, iter_all) {
if (should_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
- put_page(bvec->bv_page);
+ if (!bio_flagged(&bio, BIO_NO_PAGE_REF))
+ put_page(bvec->bv_page);
}
if (unlikely(bio.bi_status))
@@ -307,10 +308,10 @@ static void blkdev_bio_end_io(struct bio *bio)
struct blkdev_dio *dio = bio->bi_private;
bool should_dirty = dio->should_dirty;
- if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
- if (bio->bi_status && !dio->bio.bi_status)
- dio->bio.bi_status = bio->bi_status;
- } else {
+ if (bio->bi_status && !dio->bio.bi_status)
+ dio->bio.bi_status = bio->bi_status;
+
+ if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
if (!dio->is_sync) {
struct kiocb *iocb = dio->iocb;
ssize_t ret;
@@ -336,12 +337,14 @@ static void blkdev_bio_end_io(struct bio *bio)
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
- struct bio_vec *bvec;
- int i;
- struct bvec_iter_all iter_all;
+ if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+ struct bvec_iter_all iter_all;
+ struct bio_vec *bvec;
+ int i;
- bio_for_each_segment_all(bvec, bio, i, iter_all)
- put_page(bvec->bv_page);
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
+ put_page(bvec->bv_page);
+ }
bio_put(bio);
}
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 129d26226e70..b3642367a595 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1210,6 +1210,8 @@ enum {
* Set for the subvolume tree owning the reloc tree.
*/
BTRFS_ROOT_DEAD_RELOC_TREE,
+ /* Mark dead root stored on device whose cleanup needs to be resumed */
+ BTRFS_ROOT_DEAD_TREE,
};
/*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f0cdb53f3e2d..6fe9197f6ee4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2958,7 +2958,7 @@ int open_ctree(struct super_block *sb,
sb->s_bdi->congested_fn = btrfs_congested_fn;
sb->s_bdi->congested_data = fs_info;
sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 994f0cc41799..c5880329ae37 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6174,7 +6174,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
*
* This is overestimating in most cases.
*/
- qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
+ qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
spin_lock(&block_rsv->lock);
block_rsv->size = reserve_size;
@@ -8764,6 +8764,8 @@ struct walk_control {
u64 refs[BTRFS_MAX_LEVEL];
u64 flags[BTRFS_MAX_LEVEL];
struct btrfs_key update_progress;
+ struct btrfs_key drop_progress;
+ int drop_level;
int stage;
int level;
int shared_level;
@@ -8771,6 +8773,7 @@ struct walk_control {
int keep_locks;
int reada_slot;
int reada_count;
+ int restarted;
};
#define DROP_REFERENCE 1
@@ -8934,6 +8937,33 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
}
/*
+ * This is used to verify a ref exists for this root to deal with a bug where we
+ * would have a drop_progress key that hadn't been updated properly.
+ */
+static int check_ref_exists(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr, u64 parent,
+ int level)
+{
+ struct btrfs_path *path;
+ struct btrfs_extent_inline_ref *iref;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = lookup_extent_backref(trans, path, &iref, bytenr,
+ root->fs_info->nodesize, parent,
+ root->root_key.objectid, level, 0);
+ btrfs_free_path(path);
+ if (ret == -ENOENT)
+ return 0;
+ if (ret < 0)
+ return ret;
+ return 1;
+}
+
+/*
* helper to process tree block pointer.
*
* when wc->stage == DROP_REFERENCE, this function checks
@@ -9088,6 +9118,23 @@ skip:
}
/*
+ * If we had a drop_progress we need to verify the refs are set
+ * as expected. If we find our ref then we know that from here
+ * on out everything should be correct, and we can clear the
+ * ->restarted flag.
+ */
+ if (wc->restarted) {
+ ret = check_ref_exists(trans, root, bytenr, parent,
+ level - 1);
+ if (ret < 0)
+ goto out_unlock;
+ if (ret == 0)
+ goto no_delete;
+ ret = 0;
+ wc->restarted = 0;
+ }
+
+ /*
* Reloc tree doesn't contribute to qgroup numbers, and we have
* already accounted them at merge time (replace_path),
* thus we could skip expensive subtree trace here.
@@ -9102,13 +9149,23 @@ skip:
ret);
}
}
+
+ /*
+ * We need to update the next key in our walk control so we can
+ * update the drop_progress key accordingly. We don't care if
+ * find_next_key doesn't find a key because that means we're at
+ * the end and are going to clean up now.
+ */
+ wc->drop_level = level;
+ find_next_key(path, level, &wc->drop_progress);
+
ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize,
parent, root->root_key.objectid,
level - 1, 0);
if (ret)
goto out_unlock;
}
-
+no_delete:
*lookup_info = 1;
ret = 1;
@@ -9425,6 +9482,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
}
+ wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
wc->level = level;
wc->shared_level = -1;
wc->stage = DROP_REFERENCE;
@@ -9452,12 +9510,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
if (wc->stage == DROP_REFERENCE) {
- level = wc->level;
- btrfs_node_key(path->nodes[level],
- &root_item->drop_progress,
- path->slots[level]);
- root_item->drop_level = level;
- }
+ wc->drop_level = wc->level;
+ btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
+ &wc->drop_progress,
+ path->slots[wc->drop_level]);
+ }
+ btrfs_cpu_key_to_disk(&root_item->drop_progress,
+ &wc->drop_progress);
+ root_item->drop_level = wc->drop_level;
BUG_ON(wc->level == 0);
if (btrfs_should_end_transaction(trans) ||
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ab705183d749..ca8b8e785cf3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2995,11 +2995,11 @@ static int __do_readpage(struct extent_io_tree *tree,
*/
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
prev_em_start && *prev_em_start != (u64)-1 &&
- *prev_em_start != em->orig_start)
+ *prev_em_start != em->start)
force_bio_submit = true;
if (prev_em_start)
- *prev_em_start = em->orig_start;
+ *prev_em_start = em->start;
free_extent_map(em);
em = NULL;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 920bf3b4b0ef..cccc75d15970 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -427,9 +428,13 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
unsigned long this_sum_bytes = 0;
int i;
u64 offset;
+ unsigned nofs_flag;
+
+ nofs_flag = memalloc_nofs_save();
+ sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
+ GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
- sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
- GFP_NOFS);
if (!sums)
return BLK_STS_RESOURCE;
@@ -472,8 +477,10 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
bytes_left = bio->bi_iter.bi_size - total_bytes;
- sums = kzalloc(btrfs_ordered_sum_size(fs_info, bytes_left),
- GFP_NOFS);
+ nofs_flag = memalloc_nofs_save();
+ sums = kvzalloc(btrfs_ordered_sum_size(fs_info,
+ bytes_left), GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
BUG_ON(!sums); /* -ENOMEM */
sums->len = bytes_left;
ordered = btrfs_lookup_ordered_extent(inode,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 82fdda8ff5ab..2973608824ec 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6783,7 +6783,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
- u8 extent_type;
+ int extent_type = -1;
struct btrfs_path *path = NULL;
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *item;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 494f0f10d70e..cd4e693406a0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -501,6 +501,16 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ /*
+ * If the fs is mounted with nologreplay, which requires it to be
+ * mounted in RO mode as well, we can not allow discard on free space
+ * inside block groups, because log trees refer to extents that are not
+ * pinned in a block group's free space cache (pinning the extents is
+ * precisely the first phase of replaying a log tree).
+ */
+ if (btrfs_test_opt(fs_info, NOLOGREPLAY))
+ return -EROFS;
+
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
dev_list) {
@@ -3207,21 +3217,6 @@ out:
return ret;
}
-static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
-{
- inode_unlock(inode1);
- inode_unlock(inode2);
-}
-
-static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
-{
- if (inode1 < inode2)
- swap(inode1, inode2);
-
- inode_lock_nested(inode1, I_MUTEX_PARENT);
- inode_lock_nested(inode2, I_MUTEX_CHILD);
-}
-
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
struct inode *inode2, u64 loff2, u64 len)
{
@@ -3956,7 +3951,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (same_inode)
inode_lock(inode_in);
else
- btrfs_double_inode_lock(inode_in, inode_out);
+ lock_two_nondirectories(inode_in, inode_out);
/* don't make the dst file partly checksummed */
if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
@@ -4013,7 +4008,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (same_inode)
inode_unlock(inode_in);
else
- btrfs_double_inode_unlock(inode_in, inode_out);
+ unlock_two_nondirectories(inode_in, inode_out);
return ret;
}
@@ -4043,7 +4038,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
if (same_inode)
inode_unlock(src_inode);
else
- btrfs_double_inode_unlock(src_inode, dst_inode);
+ unlock_two_nondirectories(src_inode, dst_inode);
return ret < 0 ? ret : len;
}
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6fde2b2741ef..45e3cfd1198b 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
@@ -442,7 +443,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
cur = entry->list.next;
sum = list_entry(cur, struct btrfs_ordered_sum, list);
list_del(&sum->list);
- kfree(sum);
+ kvfree(sum);
}
kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index dc6140013ae8..61d22a56c0ba 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -366,11 +366,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
static int prop_compression_validate(const char *value, size_t len)
{
- if (!strncmp("lzo", value, len))
+ if (!strncmp("lzo", value, 3))
return 0;
- else if (!strncmp("zlib", value, len))
+ else if (!strncmp("zlib", value, 4))
return 0;
- else if (!strncmp("zstd", value, len))
+ else if (!strncmp("zstd", value, 4))
return 0;
return -EINVAL;
@@ -396,7 +396,7 @@ static int prop_compression_apply(struct inode *inode,
btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
} else if (!strncmp("zlib", value, 4)) {
type = BTRFS_COMPRESS_ZLIB;
- } else if (!strncmp("zstd", value, len)) {
+ } else if (!strncmp("zstd", value, 4)) {
type = BTRFS_COMPRESS_ZSTD;
btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
} else {
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c1cd5558a646..e659d9d61107 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -894,6 +894,12 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
if (fs_info->quota_root)
goto out;
+ fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
+ if (!fs_info->qgroup_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
/*
* 1 for quota root item
* 1 for BTRFS_QGROUP_STATUS item
@@ -909,13 +915,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
goto out;
}
- fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
- if (!fs_info->qgroup_ulist) {
- ret = -ENOMEM;
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
-
/*
* initially create the quota tree
*/
@@ -1923,8 +1922,8 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
int i;
/* Level sanity check */
- if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL ||
- root_level < 0 || root_level >= BTRFS_MAX_LEVEL ||
+ if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
root_level < cur_level) {
btrfs_err_rl(fs_info,
"%s: bad levels, cur_level=%d root_level=%d",
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 1869ba8e5981..67a6f7d47402 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -2430,8 +2430,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
bitmap_clear(rbio->dbitmap, pagenr, 1);
kunmap(p);
- for (stripe = 0; stripe < rbio->real_stripes; stripe++)
+ for (stripe = 0; stripe < nr_data; stripe++)
kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ kunmap(p_page);
}
__free_page(p_page);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index d09b6cdb785a..b283d3a6e837 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -205,28 +205,17 @@ static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid)
#ifdef CONFIG_STACKTRACE
static void __save_stack_trace(struct ref_action *ra)
{
- struct stack_trace stack_trace;
-
- stack_trace.max_entries = MAX_TRACE;
- stack_trace.nr_entries = 0;
- stack_trace.entries = ra->trace;
- stack_trace.skip = 2;
- save_stack_trace(&stack_trace);
- ra->trace_len = stack_trace.nr_entries;
+ ra->trace_len = stack_trace_save(ra->trace, MAX_TRACE, 2);
}
static void __print_stack_trace(struct btrfs_fs_info *fs_info,
struct ref_action *ra)
{
- struct stack_trace trace;
-
if (ra->trace_len == 0) {
btrfs_err(fs_info, " ref-verify: no stacktrace");
return;
}
- trace.nr_entries = ra->trace_len;
- trace.entries = ra->trace;
- print_stack_trace(&trace, 2);
+ stack_trace_print(ra->trace, ra->trace_len, 2);
}
#else
static void inline __save_stack_trace(struct ref_action *ra)
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0d2b957ca3a3..893d12fbfda0 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -263,8 +263,10 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
if (root) {
WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
&root->state));
- if (btrfs_root_refs(&root->root_item) == 0)
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
+ }
continue;
}
@@ -310,8 +312,10 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
break;
}
- if (btrfs_root_refs(&root->root_item) == 0)
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
+ }
}
btrfs_free_path(path);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index acdad6d658f5..e4e665f422fc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1886,8 +1886,10 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
}
}
-static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
/*
* We use writeback_inodes_sb here because if we used
* btrfs_start_delalloc_roots we would deadlock with fs freeze.
@@ -1897,15 +1899,50 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
* from already being in a transaction and our join_transaction doesn't
* have to re-take the fs freeze lock.
*/
- if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
+ } else {
+ struct btrfs_pending_snapshot *pending;
+ struct list_head *head = &trans->transaction->pending_snapshots;
+
+ /*
+ * Flush dellaloc for any root that is going to be snapshotted.
+ * This is done to avoid a corrupted version of files, in the
+ * snapshots, that had both buffered and direct IO writes (even
+ * if they were done sequentially) due to an unordered update of
+ * the inode's size on disk.
+ */
+ list_for_each_entry(pending, head, list) {
+ int ret;
+
+ ret = btrfs_start_delalloc_snapshot(pending->root);
+ if (ret)
+ return ret;
+ }
+ }
return 0;
}
-static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+static inline void btrfs_wait_delalloc_flush(struct btrfs_trans_handle *trans)
{
- if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ } else {
+ struct btrfs_pending_snapshot *pending;
+ struct list_head *head = &trans->transaction->pending_snapshots;
+
+ /*
+ * Wait for any dellaloc that we started previously for the roots
+ * that are going to be snapshotted. This is to avoid a corrupted
+ * version of files in the snapshots that had both buffered and
+ * direct IO writes (even if they were done sequentially).
+ */
+ list_for_each_entry(pending, head, list)
+ btrfs_wait_ordered_extents(pending->root,
+ U64_MAX, 0, U64_MAX);
+ }
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
@@ -2023,7 +2060,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
extwriter_counter_dec(cur_trans, trans->type);
- ret = btrfs_start_delalloc_flush(fs_info);
+ ret = btrfs_start_delalloc_flush(trans);
if (ret)
goto cleanup_transaction;
@@ -2039,7 +2076,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto cleanup_transaction;
- btrfs_wait_delalloc_flush(fs_info);
+ btrfs_wait_delalloc_flush(trans);
btrfs_scrub_pause(fs_info);
/*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f06454a55e00..561884f60d35 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3578,9 +3578,16 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- /* find the first key from this transaction again */
+ /*
+ * Find the first key from this transaction again. See the note for
+ * log_new_dir_dentries, if we're logging a directory recursively we
+ * won't be holding its i_mutex, which means we can modify the directory
+ * while we're logging it. If we remove an entry between our first
+ * search and this search we'll not find the key again and can just
+ * bail.
+ */
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
- if (WARN_ON(ret != 0))
+ if (ret != 0)
goto done;
/*
@@ -4544,6 +4551,19 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
*size_ret = btrfs_inode_size(path->nodes[0], item);
+ /*
+ * If the in-memory inode's i_size is smaller then the inode
+ * size stored in the btree, return the inode's i_size, so
+ * that we get a correct inode size after replaying the log
+ * when before a power failure we had a shrinking truncate
+ * followed by addition of a new name (rename / new hard link).
+ * Otherwise return the inode size from the btree, to avoid
+ * data loss when replaying a log due to previously doing a
+ * write that expands the inode's size and logging a new name
+ * immediately after.
+ */
+ if (*size_ret > inode->vfs_inode.i_size)
+ *size_ret = inode->vfs_inode.i_size;
}
btrfs_release_path(path);
@@ -4705,15 +4725,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, extent) ==
- BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(leaf, extent);
- ASSERT(len == i_size ||
- (len == fs_info->sectorsize &&
- btrfs_file_extent_compression(leaf, extent) !=
- BTRFS_COMPRESS_NONE) ||
- (len < i_size && i_size < fs_info->sectorsize));
+ BTRFS_FILE_EXTENT_INLINE)
return 0;
- }
len = btrfs_file_extent_num_bytes(leaf, extent);
/* Last extent goes beyond i_size, no need to log a hole. */
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9024eee889b9..db934ceae9c1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6407,7 +6407,7 @@ static void btrfs_end_bio(struct bio *bio)
if (bio_op(bio) == REQ_OP_WRITE)
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
- else
+ else if (!(bio->bi_opf & REQ_RAHEAD))
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_READ_ERRS);
if (bio->bi_opf & REQ_PREFLUSH)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 3e418a3aeb11..6b9e29d050f3 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -195,8 +195,7 @@ static void zstd_cleanup_workspace_manager(void)
struct workspace *workspace;
int i;
- del_timer(&wsm.timer);
-
+ spin_lock(&wsm.lock);
for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
while (!list_empty(&wsm.idle_ws[i])) {
workspace = container_of(wsm.idle_ws[i].next,
@@ -206,6 +205,9 @@ static void zstd_cleanup_workspace_manager(void)
wsm.ops->free_workspace(&workspace->list);
}
}
+ spin_unlock(&wsm.lock);
+
+ del_timer_sync(&wsm.timer);
}
/*
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index bba28a5034ba..36a8dc699448 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -148,11 +148,17 @@ void ceph_caps_finalize(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->caps_list_lock);
}
-void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
+void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
+ struct ceph_mount_options *fsopt)
{
spin_lock(&mdsc->caps_list_lock);
- mdsc->caps_min_count += delta;
- BUG_ON(mdsc->caps_min_count < 0);
+ mdsc->caps_min_count = fsopt->max_readdir;
+ if (mdsc->caps_min_count < 1024)
+ mdsc->caps_min_count = 1024;
+ mdsc->caps_use_max = fsopt->caps_max;
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_max < mdsc->caps_min_count)
+ mdsc->caps_use_max = mdsc->caps_min_count;
spin_unlock(&mdsc->caps_list_lock);
}
@@ -272,6 +278,7 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
if (!err) {
BUG_ON(have + alloc != need);
ctx->count = need;
+ ctx->used = 0;
}
spin_lock(&mdsc->caps_list_lock);
@@ -295,13 +302,24 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
}
void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
- struct ceph_cap_reservation *ctx)
+ struct ceph_cap_reservation *ctx)
{
+ bool reclaim = false;
+ if (!ctx->count)
+ return;
+
dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
spin_lock(&mdsc->caps_list_lock);
__ceph_unreserve_caps(mdsc, ctx->count);
ctx->count = 0;
+
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_count > mdsc->caps_use_max)
+ reclaim = true;
spin_unlock(&mdsc->caps_list_lock);
+
+ if (reclaim)
+ ceph_reclaim_caps_nr(mdsc, ctx->used);
}
struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
@@ -346,6 +364,7 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
BUG_ON(list_empty(&mdsc->caps_list));
ctx->count--;
+ ctx->used++;
mdsc->caps_reserve_count--;
mdsc->caps_use_count++;
@@ -500,12 +519,12 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- struct ceph_mount_options *ma = mdsc->fsc->mount_options;
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
ci->i_hold_caps_min = round_jiffies(jiffies +
- ma->caps_wanted_delay_min * HZ);
+ opt->caps_wanted_delay_min * HZ);
ci->i_hold_caps_max = round_jiffies(jiffies +
- ma->caps_wanted_delay_max * HZ);
+ opt->caps_wanted_delay_max * HZ);
dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
}
@@ -657,6 +676,10 @@ void ceph_add_cap(struct inode *inode,
session->s_nr_caps++;
spin_unlock(&session->s_cap_lock);
} else {
+ spin_lock(&session->s_cap_lock);
+ list_move_tail(&cap->session_caps, &session->s_caps);
+ spin_unlock(&session->s_cap_lock);
+
if (cap->cap_gen < session->s_cap_gen)
cap->issued = cap->implemented = CEPH_CAP_PIN;
@@ -1081,9 +1104,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
(!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
cap->queue_release = 1;
if (removed) {
- list_add_tail(&cap->session_caps,
- &session->s_cap_releases);
- session->s_num_cap_releases++;
+ __ceph_queue_cap_release(session, cap);
removed = 0;
}
} else {
@@ -1245,7 +1266,7 @@ static int send_cap_msg(struct cap_msg_args *arg)
* Queue cap releases when an inode is dropped from our cache. Since
* inode is about to be destroyed, there is no need for i_ceph_lock.
*/
-void ceph_queue_caps_release(struct inode *inode)
+void __ceph_remove_caps(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct rb_node *p;
@@ -2393,6 +2414,12 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
if ((cap->issued & ci->i_flushing_caps) !=
ci->i_flushing_caps) {
ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ /* encode_caps_cb() also will reset these sequence
+ * numbers. make sure sequence numbers in cap flush
+ * message match later reconnect message */
+ cap->seq = 0;
+ cap->issue_seq = 0;
+ cap->mseq = 0;
__kick_flushing_caps(mdsc, session, ci,
oldest_flush_tid);
} else {
@@ -3880,12 +3907,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
cap->seq = seq;
cap->issue_seq = seq;
spin_lock(&session->s_cap_lock);
- list_add_tail(&cap->session_caps,
- &session->s_cap_releases);
- session->s_num_cap_releases++;
+ __ceph_queue_cap_release(session, cap);
spin_unlock(&session->s_cap_lock);
}
- goto flush_cap_releases;
+ goto done;
}
/* these will work even if we don't have a cap yet */
@@ -3955,7 +3980,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
ceph_cap_op_name(op));
}
- goto done;
+done:
+ mutex_unlock(&session->s_mutex);
+done_unlocked:
+ iput(inode);
+ ceph_put_string(extra_info.pool_ns);
+ return;
flush_cap_releases:
/*
@@ -3963,14 +3993,8 @@ flush_cap_releases:
* along for the mds (who clearly thinks we still have this
* cap).
*/
- ceph_send_cap_releases(mdsc, session);
-
-done:
- mutex_unlock(&session->s_mutex);
-done_unlocked:
- iput(inode);
- ceph_put_string(extra_info.pool_ns);
- return;
+ ceph_flush_cap_releases(mdsc, session);
+ goto done;
bad:
pr_err("ceph_handle_caps: corrupt message\n");
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index abdf98deeec4..98365e74cb4a 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -139,23 +139,6 @@ static int caps_show(struct seq_file *s, void *p)
return 0;
}
-static int dentry_lru_show(struct seq_file *s, void *ptr)
-{
- struct ceph_fs_client *fsc = s->private;
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_dentry_info *di;
-
- spin_lock(&mdsc->dentry_lru_lock);
- list_for_each_entry(di, &mdsc->dentry_lru, lru) {
- struct dentry *dentry = di->dentry;
- seq_printf(s, "%p %p\t%pd\n",
- di, dentry, dentry);
- }
- spin_unlock(&mdsc->dentry_lru_lock);
-
- return 0;
-}
-
static int mds_sessions_show(struct seq_file *s, void *ptr)
{
struct ceph_fs_client *fsc = s->private;
@@ -195,7 +178,6 @@ static int mds_sessions_show(struct seq_file *s, void *ptr)
CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
CEPH_DEFINE_SHOW_FUNC(mdsc_show)
CEPH_DEFINE_SHOW_FUNC(caps_show)
-CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
@@ -231,7 +213,6 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
debugfs_remove(fsc->debugfs_mds_sessions);
debugfs_remove(fsc->debugfs_caps);
debugfs_remove(fsc->debugfs_mdsc);
- debugfs_remove(fsc->debugfs_dentry_lru);
}
int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
@@ -291,14 +272,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
if (!fsc->debugfs_caps)
goto out;
- fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
- 0400,
- fsc->client->debugfs_dir,
- fsc,
- &dentry_lru_show_fops);
- if (!fsc->debugfs_dentry_lru)
- goto out;
-
return 0;
out:
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 82928cea0209..0637149fb9f9 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -29,6 +29,9 @@
const struct dentry_operations ceph_dentry_ops;
+static bool __dentry_lease_is_valid(struct ceph_dentry_info *di);
+static int __dir_lease_try_check(const struct dentry *dentry);
+
/*
* Initialize ceph dentry state.
*/
@@ -44,7 +47,7 @@ static int ceph_d_init(struct dentry *dentry)
di->lease_session = NULL;
di->time = jiffies;
dentry->d_fsdata = di;
- ceph_dentry_lru_add(dentry);
+ INIT_LIST_HEAD(&di->lease_list);
return 0;
}
@@ -241,6 +244,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
goto out;
}
if (fpos_cmp(ctx->pos, di->offset) <= 0) {
+ __ceph_dentry_dir_lease_touch(di);
emit_dentry = true;
}
spin_unlock(&dentry->d_lock);
@@ -1125,13 +1129,277 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
}
/*
+ * Move dentry to tail of mdsc->dentry_leases list when lease is updated.
+ * Leases at front of the list will expire first. (Assume all leases have
+ * similar duration)
+ *
+ * Called under dentry->d_lock.
+ */
+void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
+{
+ struct dentry *dn = di->dentry;
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn);
+
+ di->flags |= CEPH_DENTRY_LEASE_LIST;
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ return;
+ }
+
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_list_lock);
+ list_move_tail(&di->lease_list, &mdsc->dentry_leases);
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc,
+ struct ceph_dentry_info *di)
+{
+ di->flags &= ~(CEPH_DENTRY_LEASE_LIST | CEPH_DENTRY_REFERENCED);
+ di->lease_gen = 0;
+ di->time = jiffies;
+ list_move_tail(&di->lease_list, &mdsc->dentry_dir_leases);
+}
+
+/*
+ * When dir lease is used, add dentry to tail of mdsc->dentry_dir_leases
+ * list if it's not in the list, otherwise set 'referenced' flag.
+ *
+ * Called under dentry->d_lock.
+ */
+void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
+{
+ struct dentry *dn = di->dentry;
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_dir_lease_touch %p %p '%pd' (offset %lld)\n",
+ di, dn, dn, di->offset);
+
+ if (!list_empty(&di->lease_list)) {
+ if (di->flags & CEPH_DENTRY_LEASE_LIST) {
+ /* don't remove dentry from dentry lease list
+ * if its lease is valid */
+ if (__dentry_lease_is_valid(di))
+ return;
+ } else {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ return;
+ }
+ }
+
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ di->flags &= ~CEPH_DENTRY_LEASE_LIST;
+ return;
+ }
+
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_list_lock);
+ __dentry_dir_lease_touch(mdsc, di),
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+static void __dentry_lease_unlist(struct ceph_dentry_info *di)
+{
+ struct ceph_mds_client *mdsc;
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST)
+ return;
+ if (list_empty(&di->lease_list))
+ return;
+
+ mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_list_lock);
+ list_del_init(&di->lease_list);
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+enum {
+ KEEP = 0,
+ DELETE = 1,
+ TOUCH = 2,
+ STOP = 4,
+};
+
+struct ceph_lease_walk_control {
+ bool dir_lease;
+ bool expire_dir_lease;
+ unsigned long nr_to_scan;
+ unsigned long dir_lease_ttl;
+};
+
+static unsigned long
+__dentry_leases_walk(struct ceph_mds_client *mdsc,
+ struct ceph_lease_walk_control *lwc,
+ int (*check)(struct dentry*, void*))
+{
+ struct ceph_dentry_info *di, *tmp;
+ struct dentry *dentry, *last = NULL;
+ struct list_head* list;
+ LIST_HEAD(dispose);
+ unsigned long freed = 0;
+ int ret = 0;
+
+ list = lwc->dir_lease ? &mdsc->dentry_dir_leases : &mdsc->dentry_leases;
+ spin_lock(&mdsc->dentry_list_lock);
+ list_for_each_entry_safe(di, tmp, list, lease_list) {
+ if (!lwc->nr_to_scan)
+ break;
+ --lwc->nr_to_scan;
+
+ dentry = di->dentry;
+ if (last == dentry)
+ break;
+
+ if (!spin_trylock(&dentry->d_lock))
+ continue;
+
+ if (dentry->d_lockref.count < 0) {
+ list_del_init(&di->lease_list);
+ goto next;
+ }
+
+ ret = check(dentry, lwc);
+ if (ret & TOUCH) {
+ /* move it into tail of dir lease list */
+ __dentry_dir_lease_touch(mdsc, di);
+ if (!last)
+ last = dentry;
+ }
+ if (ret & DELETE) {
+ /* stale lease */
+ di->flags &= ~CEPH_DENTRY_REFERENCED;
+ if (dentry->d_lockref.count > 0) {
+ /* update_dentry_lease() will re-add
+ * it to lease list, or
+ * ceph_d_delete() will return 1 when
+ * last reference is dropped */
+ list_del_init(&di->lease_list);
+ } else {
+ di->flags |= CEPH_DENTRY_SHRINK_LIST;
+ list_move_tail(&di->lease_list, &dispose);
+ dget_dlock(dentry);
+ }
+ }
+next:
+ spin_unlock(&dentry->d_lock);
+ if (ret & STOP)
+ break;
+ }
+ spin_unlock(&mdsc->dentry_list_lock);
+
+ while (!list_empty(&dispose)) {
+ di = list_first_entry(&dispose, struct ceph_dentry_info,
+ lease_list);
+ dentry = di->dentry;
+ spin_lock(&dentry->d_lock);
+
+ list_del_init(&di->lease_list);
+ di->flags &= ~CEPH_DENTRY_SHRINK_LIST;
+ if (di->flags & CEPH_DENTRY_REFERENCED) {
+ spin_lock(&mdsc->dentry_list_lock);
+ if (di->flags & CEPH_DENTRY_LEASE_LIST) {
+ list_add_tail(&di->lease_list,
+ &mdsc->dentry_leases);
+ } else {
+ __dentry_dir_lease_touch(mdsc, di);
+ }
+ spin_unlock(&mdsc->dentry_list_lock);
+ } else {
+ freed++;
+ }
+
+ spin_unlock(&dentry->d_lock);
+ /* ceph_d_delete() does the trick */
+ dput(dentry);
+ }
+ return freed;
+}
+
+static int __dentry_lease_check(struct dentry *dentry, void *arg)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int ret;
+
+ if (__dentry_lease_is_valid(di))
+ return STOP;
+ ret = __dir_lease_try_check(dentry);
+ if (ret == -EBUSY)
+ return KEEP;
+ if (ret > 0)
+ return TOUCH;
+ return DELETE;
+}
+
+static int __dir_lease_check(struct dentry *dentry, void *arg)
+{
+ struct ceph_lease_walk_control *lwc = arg;
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ int ret = __dir_lease_try_check(dentry);
+ if (ret == -EBUSY)
+ return KEEP;
+ if (ret > 0) {
+ if (time_before(jiffies, di->time + lwc->dir_lease_ttl))
+ return STOP;
+ /* Move dentry to tail of dir lease list if we don't want
+ * to delete it. So dentries in the list are checked in a
+ * round robin manner */
+ if (!lwc->expire_dir_lease)
+ return TOUCH;
+ if (dentry->d_lockref.count > 0 ||
+ (di->flags & CEPH_DENTRY_REFERENCED))
+ return TOUCH;
+ /* invalidate dir lease */
+ di->lease_shared_gen = 0;
+ }
+ return DELETE;
+}
+
+int ceph_trim_dentries(struct ceph_mds_client *mdsc)
+{
+ struct ceph_lease_walk_control lwc;
+ unsigned long count;
+ unsigned long freed;
+
+ spin_lock(&mdsc->caps_list_lock);
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_count > mdsc->caps_use_max)
+ count = mdsc->caps_use_count - mdsc->caps_use_max;
+ else
+ count = 0;
+ spin_unlock(&mdsc->caps_list_lock);
+
+ lwc.dir_lease = false;
+ lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2;
+ freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check);
+ if (!lwc.nr_to_scan) /* more invalid leases */
+ return -EAGAIN;
+
+ if (lwc.nr_to_scan < CEPH_CAPS_PER_RELEASE)
+ lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE;
+
+ lwc.dir_lease = true;
+ lwc.expire_dir_lease = freed < count;
+ lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
+ freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check);
+ if (!lwc.nr_to_scan) /* more to check */
+ return -EAGAIN;
+
+ return freed > 0 ? 1 : 0;
+}
+
+/*
* Ensure a dentry lease will no longer revalidate.
*/
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
spin_lock(&dentry->d_lock);
- ceph_dentry(dentry)->time = jiffies;
- ceph_dentry(dentry)->lease_shared_gen = 0;
+ di->time = jiffies;
+ di->lease_shared_gen = 0;
+ __dentry_lease_unlist(di);
spin_unlock(&dentry->d_lock);
}
@@ -1139,45 +1407,59 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
* Check if dentry lease is valid. If not, delete the lease. Try to
* renew if the least is more than half up.
*/
+static bool __dentry_lease_is_valid(struct ceph_dentry_info *di)
+{
+ struct ceph_mds_session *session;
+
+ if (!di->lease_gen)
+ return false;
+
+ session = di->lease_session;
+ if (session) {
+ u32 gen;
+ unsigned long ttl;
+
+ spin_lock(&session->s_gen_ttl_lock);
+ gen = session->s_cap_gen;
+ ttl = session->s_cap_ttl;
+ spin_unlock(&session->s_gen_ttl_lock);
+
+ if (di->lease_gen == gen &&
+ time_before(jiffies, ttl) &&
+ time_before(jiffies, di->time))
+ return true;
+ }
+ di->lease_gen = 0;
+ return false;
+}
+
static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
struct inode *dir)
{
struct ceph_dentry_info *di;
- struct ceph_mds_session *s;
- int valid = 0;
- u32 gen;
- unsigned long ttl;
struct ceph_mds_session *session = NULL;
u32 seq = 0;
+ int valid = 0;
spin_lock(&dentry->d_lock);
di = ceph_dentry(dentry);
- if (di && di->lease_session) {
- s = di->lease_session;
- spin_lock(&s->s_gen_ttl_lock);
- gen = s->s_cap_gen;
- ttl = s->s_cap_ttl;
- spin_unlock(&s->s_gen_ttl_lock);
+ if (di && __dentry_lease_is_valid(di)) {
+ valid = 1;
- if (di->lease_gen == gen &&
- time_before(jiffies, di->time) &&
- time_before(jiffies, ttl)) {
- valid = 1;
- if (di->lease_renew_after &&
- time_after(jiffies, di->lease_renew_after)) {
- /*
- * We should renew. If we're in RCU walk mode
- * though, we can't do that so just return
- * -ECHILD.
- */
- if (flags & LOOKUP_RCU) {
- valid = -ECHILD;
- } else {
- session = ceph_get_mds_session(s);
- seq = di->lease_seq;
- di->lease_renew_after = 0;
- di->lease_renew_from = jiffies;
- }
+ if (di->lease_renew_after &&
+ time_after(jiffies, di->lease_renew_after)) {
+ /*
+ * We should renew. If we're in RCU walk mode
+ * though, we can't do that so just return
+ * -ECHILD.
+ */
+ if (flags & LOOKUP_RCU) {
+ valid = -ECHILD;
+ } else {
+ session = ceph_get_mds_session(di->lease_session);
+ seq = di->lease_seq;
+ di->lease_renew_after = 0;
+ di->lease_renew_from = jiffies;
}
}
}
@@ -1193,6 +1475,38 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
}
/*
+ * Called under dentry->d_lock.
+ */
+static int __dir_lease_try_check(const struct dentry *dentry)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct inode *dir;
+ struct ceph_inode_info *ci;
+ int valid = 0;
+
+ if (!di->lease_shared_gen)
+ return 0;
+ if (IS_ROOT(dentry))
+ return 0;
+
+ dir = d_inode(dentry->d_parent);
+ ci = ceph_inode(dir);
+
+ if (spin_trylock(&ci->i_ceph_lock)) {
+ if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen &&
+ __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 0))
+ valid = 1;
+ spin_unlock(&ci->i_ceph_lock);
+ } else {
+ valid = -EBUSY;
+ }
+
+ if (!valid)
+ di->lease_shared_gen = 0;
+ return valid;
+}
+
+/*
* Check if directory-wide content lease/cap is valid.
*/
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
@@ -1205,6 +1519,8 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen)
valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
spin_unlock(&ci->i_ceph_lock);
+ if (valid)
+ __ceph_dentry_dir_lease_touch(di);
dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
dir, (unsigned)atomic_read(&ci->i_shared_gen),
dentry, (unsigned)di->lease_shared_gen, valid);
@@ -1297,11 +1613,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
}
dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
- if (valid) {
- ceph_dentry_lru_touch(dentry);
- } else {
+ if (!valid)
ceph_dir_clear_complete(dir);
- }
if (!(flags & LOOKUP_RCU))
dput(parent);
@@ -1309,6 +1622,31 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
}
/*
+ * Delete unused dentry that doesn't have valid lease
+ *
+ * Called under dentry->d_lock.
+ */
+static int ceph_d_delete(const struct dentry *dentry)
+{
+ struct ceph_dentry_info *di;
+
+ /* won't release caps */
+ if (d_really_is_negative(dentry))
+ return 0;
+ if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
+ return 0;
+ /* vaild lease? */
+ di = ceph_dentry(dentry);
+ if (di) {
+ if (__dentry_lease_is_valid(di))
+ return 0;
+ if (__dir_lease_try_check(dentry))
+ return 0;
+ }
+ return 1;
+}
+
+/*
* Release our ceph_dentry_info.
*/
static void ceph_d_release(struct dentry *dentry)
@@ -1316,9 +1654,9 @@ static void ceph_d_release(struct dentry *dentry)
struct ceph_dentry_info *di = ceph_dentry(dentry);
dout("d_release %p\n", dentry);
- ceph_dentry_lru_del(dentry);
spin_lock(&dentry->d_lock);
+ __dentry_lease_unlist(di);
dentry->d_fsdata = NULL;
spin_unlock(&dentry->d_lock);
@@ -1419,49 +1757,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
return size - left;
}
-/*
- * We maintain a private dentry LRU.
- *
- * FIXME: this needs to be changed to a per-mds lru to be useful.
- */
-void ceph_dentry_lru_add(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_add_tail(&di->lru, &mdsc->dentry_lru);
- mdsc->num_dentry++;
- spin_unlock(&mdsc->dentry_lru_lock);
-}
-
-void ceph_dentry_lru_touch(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn,
- di->offset);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_move_tail(&di->lru, &mdsc->dentry_lru);
- spin_unlock(&mdsc->dentry_lru_lock);
-}
-void ceph_dentry_lru_del(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_del_init(&di->lru);
- mdsc->num_dentry--;
- spin_unlock(&mdsc->dentry_lru_lock);
-}
/*
* Return name hash for a given dentry. This is dependent on
@@ -1470,6 +1766,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
{
struct ceph_inode_info *dci = ceph_inode(dir);
+ unsigned hash;
switch (dci->i_dir_layout.dl_dir_hash) {
case 0: /* for backward compat */
@@ -1477,8 +1774,11 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
return dn->d_name.hash;
default:
- return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
+ spin_lock(&dn->d_lock);
+ hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
dn->d_name.name, dn->d_name.len);
+ spin_unlock(&dn->d_lock);
+ return hash;
}
}
@@ -1531,6 +1831,7 @@ const struct inode_operations ceph_snapdir_iops = {
const struct dentry_operations ceph_dentry_ops = {
.d_revalidate = ceph_d_revalidate,
+ .d_delete = ceph_d_delete,
.d_release = ceph_d_release,
.d_prune = ceph_d_prune,
.d_init = ceph_d_init,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 189df668b6a0..9f53c3d99304 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -590,7 +590,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
* but it will at least behave sensibly when they are
* in sequence.
*/
- ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len);
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ off, off + len - 1);
if (ret < 0)
return ret;
@@ -929,14 +930,15 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
(write ? "write" : "read"), file, pos, (unsigned)count,
snapc, snapc->seq);
- ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ pos, pos + count - 1);
if (ret < 0)
return ret;
if (write) {
int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
- (pos + count) >> PAGE_SHIFT);
+ (pos + count - 1) >> PAGE_SHIFT);
if (ret2 < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret2);
@@ -1132,13 +1134,14 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
file, pos, (unsigned)count, snapc, snapc->seq);
- ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ pos, pos + count - 1);
if (ret < 0)
return ret;
ret = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
- (pos + count) >> PAGE_SHIFT);
+ (pos + count - 1) >> PAGE_SHIFT);
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9d1f34d46627..c2feb310ac1e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -497,7 +497,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_wrbuffer_ref = 0;
ci->i_wrbuffer_ref_head = 0;
atomic_set(&ci->i_filelock_ref, 0);
- atomic_set(&ci->i_shared_gen, 0);
+ atomic_set(&ci->i_shared_gen, 1);
ci->i_rdcache_gen = 0;
ci->i_rdcache_revoking = 0;
@@ -524,6 +524,7 @@ static void ceph_i_callback(struct rcu_head *head)
struct inode *inode = container_of(head, struct inode, i_rcu);
struct ceph_inode_info *ci = ceph_inode(inode);
+ kfree(ci->i_symlink);
kmem_cache_free(ceph_inode_cachep, ci);
}
@@ -537,7 +538,7 @@ void ceph_destroy_inode(struct inode *inode)
ceph_fscache_unregister_inode_cookie(ci);
- ceph_queue_caps_release(inode);
+ __ceph_remove_caps(inode);
if (__ceph_has_any_quota(ci))
ceph_adjust_quota_realms_count(inode, false);
@@ -548,20 +549,24 @@ void ceph_destroy_inode(struct inode *inode)
*/
if (ci->i_snap_realm) {
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
- struct ceph_snap_realm *realm = ci->i_snap_realm;
-
- dout(" dropping residual ref to snap realm %p\n", realm);
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- ci->i_snap_realm = NULL;
- if (realm->ino == ci->i_vino.ino)
- realm->inode = NULL;
- spin_unlock(&realm->inodes_with_caps_lock);
- ceph_put_snap_realm(mdsc, realm);
+ ceph_inode_to_client(inode)->mdsc;
+ if (ceph_snap(inode) == CEPH_NOSNAP) {
+ struct ceph_snap_realm *realm = ci->i_snap_realm;
+ dout(" dropping residual ref to snap realm %p\n",
+ realm);
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ ci->i_snap_realm = NULL;
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_put_snap_realm(mdsc, realm);
+ } else {
+ ceph_put_snapid_map(mdsc, ci->i_snapid_map);
+ ci->i_snap_realm = NULL;
+ }
}
- kfree(ci->i_symlink);
while ((n = rb_first(&ci->i_fragtree)) != NULL) {
frag = rb_entry(n, struct ceph_inode_frag, node);
rb_erase(n, &ci->i_fragtree);
@@ -776,6 +781,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
iinfo->pool_ns_len);
+ if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
+ ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
+
spin_lock(&ci->i_ceph_lock);
/*
@@ -869,6 +877,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
ci->i_rbytes = le64_to_cpu(info->rbytes);
ci->i_rfiles = le64_to_cpu(info->rfiles);
ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
+ ci->i_dir_pin = iinfo->dir_pin;
ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
}
}
@@ -899,6 +908,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
case S_IFBLK:
case S_IFCHR:
case S_IFSOCK:
+ inode->i_blkbits = PAGE_SHIFT;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
inode->i_op = &ceph_file_iops;
break;
@@ -1066,9 +1076,10 @@ static void update_dentry_lease(struct dentry *dentry,
goto out_unlock;
di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
-
- if (duration == 0)
+ if (duration == 0) {
+ __ceph_dentry_dir_lease_touch(di);
goto out_unlock;
+ }
if (di->lease_gen == session->s_cap_gen &&
time_before(ttl, di->time))
@@ -1079,8 +1090,6 @@ static void update_dentry_lease(struct dentry *dentry,
di->lease_session = NULL;
}
- ceph_dentry_lru_touch(dentry);
-
if (!di->lease_session)
di->lease_session = ceph_get_mds_session(session);
di->lease_gen = session->s_cap_gen;
@@ -1088,6 +1097,8 @@ static void update_dentry_lease(struct dentry *dentry,
di->lease_renew_after = half_ttl;
di->lease_renew_from = 0;
di->time = ttl;
+
+ __ceph_dentry_lease_touch(di);
out_unlock:
spin_unlock(&dentry->d_lock);
if (old_lease_session)
@@ -1152,6 +1163,19 @@ static int splice_dentry(struct dentry **pdn, struct inode *in)
return 0;
}
+static int d_name_cmp(struct dentry *dentry, const char *name, size_t len)
+{
+ int ret;
+
+ /* take d_lock to ensure dentry->d_name stability */
+ spin_lock(&dentry->d_lock);
+ ret = dentry->d_name.len - len;
+ if (!ret)
+ ret = memcmp(dentry->d_name.name, name, len);
+ spin_unlock(&dentry->d_lock);
+ return ret;
+}
+
/*
* Incorporate results into the local cache. This is either just
* one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
@@ -1401,7 +1425,8 @@ retry_lookup:
err = splice_dentry(&req->r_dentry, in);
if (err < 0)
goto done;
- } else if (rinfo->head->is_dentry) {
+ } else if (rinfo->head->is_dentry &&
+ !d_name_cmp(req->r_dentry, rinfo->dname, rinfo->dname_len)) {
struct ceph_vino *ptvino = NULL;
if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) ||
@@ -2259,10 +2284,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
if (!err) {
generic_fillattr(inode, stat);
stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
- if (ceph_snap(inode) != CEPH_NOSNAP)
- stat->dev = ceph_snap(inode);
+ if (ceph_snap(inode) == CEPH_NOSNAP)
+ stat->dev = inode->i_sb->s_dev;
else
- stat->dev = 0;
+ stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
+
if (S_ISDIR(inode->i_mode)) {
if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
RBYTES))
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 163fc74bf221..9049c2a3e972 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -20,6 +20,8 @@
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
+#define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE)
+
/*
* A cluster of MDS (metadata server) daemons is responsible for
* managing the file system namespace (the directory hierarchy and
@@ -46,13 +48,17 @@
*/
struct ceph_reconnect_state {
- int nr_caps;
+ struct ceph_mds_session *session;
+ int nr_caps, nr_realms;
struct ceph_pagelist *pagelist;
unsigned msg_version;
+ bool allow_multi;
};
static void __wake_requests(struct ceph_mds_client *mdsc,
struct list_head *head);
+static void ceph_cap_release_work(struct work_struct *work);
+static void ceph_cap_reclaim_work(struct work_struct *work);
static const struct ceph_connection_operations mds_con_ops;
@@ -61,6 +67,29 @@ static const struct ceph_connection_operations mds_con_ops;
* mds reply parsing
*/
+static int parse_reply_info_quota(void **p, void *end,
+ struct ceph_mds_reply_info_in *info)
+{
+ u8 struct_v, struct_compat;
+ u32 struct_len;
+
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only
+ * understand encoding with struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
+ ceph_decode_64_safe(p, end, info->max_bytes, bad);
+ ceph_decode_64_safe(p, end, info->max_files, bad);
+ *p = end;
+ return 0;
+bad:
+ return -EIO;
+}
+
/*
* parse individual inode info
*/
@@ -68,8 +97,24 @@ static int parse_reply_info_in(void **p, void *end,
struct ceph_mds_reply_info_in *info,
u64 features)
{
- int err = -EIO;
+ int err = 0;
+ u8 struct_v = 0;
+ if (features == (u64)-1) {
+ u32 struct_len;
+ u8 struct_compat;
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding with struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
+ }
+
+ ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad);
info->in = *p;
*p += sizeof(struct ceph_mds_reply_inode) +
sizeof(*info->in->fragtree.splits) *
@@ -87,49 +132,136 @@ static int parse_reply_info_in(void **p, void *end,
info->xattr_data = *p;
*p += info->xattr_len;
- if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
+ if (features == (u64)-1) {
+ /* inline data */
ceph_decode_64_safe(p, end, info->inline_version, bad);
ceph_decode_32_safe(p, end, info->inline_len, bad);
ceph_decode_need(p, end, info->inline_len, bad);
info->inline_data = *p;
*p += info->inline_len;
- } else
- info->inline_version = CEPH_INLINE_NONE;
+ /* quota */
+ err = parse_reply_info_quota(p, end, info);
+ if (err < 0)
+ goto out_bad;
+ /* pool namespace */
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
+ /* btime, change_attr */
+ {
+ struct ceph_timespec btime;
+ u64 change_attr;
+ ceph_decode_need(p, end, sizeof(btime), bad);
+ ceph_decode_copy(p, &btime, sizeof(btime));
+ ceph_decode_64_safe(p, end, change_attr, bad);
+ }
+
+ /* dir pin */
+ if (struct_v >= 2) {
+ ceph_decode_32_safe(p, end, info->dir_pin, bad);
+ } else {
+ info->dir_pin = -ENODATA;
+ }
+
+ *p = end;
+ } else {
+ if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
+ ceph_decode_64_safe(p, end, info->inline_version, bad);
+ ceph_decode_32_safe(p, end, info->inline_len, bad);
+ ceph_decode_need(p, end, info->inline_len, bad);
+ info->inline_data = *p;
+ *p += info->inline_len;
+ } else
+ info->inline_version = CEPH_INLINE_NONE;
+
+ if (features & CEPH_FEATURE_MDS_QUOTA) {
+ err = parse_reply_info_quota(p, end, info);
+ if (err < 0)
+ goto out_bad;
+ } else {
+ info->max_bytes = 0;
+ info->max_files = 0;
+ }
+
+ info->pool_ns_len = 0;
+ info->pool_ns_data = NULL;
+ if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
+ }
+
+ info->dir_pin = -ENODATA;
+ }
+ return 0;
+bad:
+ err = -EIO;
+out_bad:
+ return err;
+}
- if (features & CEPH_FEATURE_MDS_QUOTA) {
+static int parse_reply_info_dir(void **p, void *end,
+ struct ceph_mds_reply_dirfrag **dirfrag,
+ u64 features)
+{
+ if (features == (u64)-1) {
u8 struct_v, struct_compat;
u32 struct_len;
-
- /*
- * both struct_v and struct_compat are expected to be >= 1
- */
ceph_decode_8_safe(p, end, struct_v, bad);
ceph_decode_8_safe(p, end, struct_compat, bad);
- if (!struct_v || !struct_compat)
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding whose struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
goto bad;
ceph_decode_32_safe(p, end, struct_len, bad);
ceph_decode_need(p, end, struct_len, bad);
- ceph_decode_64_safe(p, end, info->max_bytes, bad);
- ceph_decode_64_safe(p, end, info->max_files, bad);
- } else {
- info->max_bytes = 0;
- info->max_files = 0;
+ end = *p + struct_len;
}
- info->pool_ns_len = 0;
- info->pool_ns_data = NULL;
- if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
- ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
- if (info->pool_ns_len > 0) {
- ceph_decode_need(p, end, info->pool_ns_len, bad);
- info->pool_ns_data = *p;
- *p += info->pool_ns_len;
- }
+ ceph_decode_need(p, end, sizeof(**dirfrag), bad);
+ *dirfrag = *p;
+ *p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist);
+ if (unlikely(*p > end))
+ goto bad;
+ if (features == (u64)-1)
+ *p = end;
+ return 0;
+bad:
+ return -EIO;
+}
+
+static int parse_reply_info_lease(void **p, void *end,
+ struct ceph_mds_reply_lease **lease,
+ u64 features)
+{
+ if (features == (u64)-1) {
+ u8 struct_v, struct_compat;
+ u32 struct_len;
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding whose struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
}
+ ceph_decode_need(p, end, sizeof(**lease), bad);
+ *lease = *p;
+ *p += sizeof(**lease);
+ if (features == (u64)-1)
+ *p = end;
return 0;
bad:
- return err;
+ return -EIO;
}
/*
@@ -147,20 +279,18 @@ static int parse_reply_info_trace(void **p, void *end,
if (err < 0)
goto out_bad;
- if (unlikely(*p + sizeof(*info->dirfrag) > end))
- goto bad;
- info->dirfrag = *p;
- *p += sizeof(*info->dirfrag) +
- sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
- if (unlikely(*p > end))
- goto bad;
+ err = parse_reply_info_dir(p, end, &info->dirfrag, features);
+ if (err < 0)
+ goto out_bad;
ceph_decode_32_safe(p, end, info->dname_len, bad);
ceph_decode_need(p, end, info->dname_len, bad);
info->dname = *p;
*p += info->dname_len;
- info->dlease = *p;
- *p += sizeof(*info->dlease);
+
+ err = parse_reply_info_lease(p, end, &info->dlease, features);
+ if (err < 0)
+ goto out_bad;
}
if (info->head->is_target) {
@@ -183,20 +313,16 @@ out_bad:
/*
* parse readdir results
*/
-static int parse_reply_info_dir(void **p, void *end,
+static int parse_reply_info_readdir(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
u32 num, i = 0;
int err;
- info->dir_dir = *p;
- if (*p + sizeof(*info->dir_dir) > end)
- goto bad;
- *p += sizeof(*info->dir_dir) +
- sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
- if (*p > end)
- goto bad;
+ err = parse_reply_info_dir(p, end, &info->dir_dir, features);
+ if (err < 0)
+ goto out_bad;
ceph_decode_need(p, end, sizeof(num) + 2, bad);
num = ceph_decode_32(p);
@@ -222,15 +348,16 @@ static int parse_reply_info_dir(void **p, void *end,
while (num) {
struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
/* dentry */
- ceph_decode_need(p, end, sizeof(u32)*2, bad);
- rde->name_len = ceph_decode_32(p);
+ ceph_decode_32_safe(p, end, rde->name_len, bad);
ceph_decode_need(p, end, rde->name_len, bad);
rde->name = *p;
*p += rde->name_len;
dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
- rde->lease = *p;
- *p += sizeof(struct ceph_mds_reply_lease);
+ /* dentry lease */
+ err = parse_reply_info_lease(p, end, &rde->lease, features);
+ if (err)
+ goto out_bad;
/* inode */
err = parse_reply_info_in(p, end, &rde->inode, features);
if (err < 0)
@@ -281,7 +408,8 @@ static int parse_reply_info_create(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
- if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+ if (features == (u64)-1 ||
+ (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
if (*p == end) {
info->has_create_ino = false;
} else {
@@ -310,7 +438,7 @@ static int parse_reply_info_extra(void **p, void *end,
if (op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features);
else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
- return parse_reply_info_dir(p, end, info, features);
+ return parse_reply_info_readdir(p, end, info, features);
else if (op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features);
else
@@ -494,7 +622,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
spin_lock_init(&s->s_gen_ttl_lock);
- s->s_cap_gen = 0;
+ s->s_cap_gen = 1;
s->s_cap_ttl = jiffies - 1;
spin_lock_init(&s->s_cap_lock);
@@ -510,6 +638,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_cap_reconnect = 0;
s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases);
+ INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);
+
INIT_LIST_HEAD(&s->s_cap_flushing);
mdsc->sessions[mds] = s;
@@ -535,6 +665,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc,
dout("__unregister_session mds%d %p\n", s->s_mds, s);
BUG_ON(mdsc->sessions[s->s_mds] != s);
mdsc->sessions[s->s_mds] = NULL;
+ s->s_state = 0;
ceph_con_close(&s->s_con);
ceph_put_mds_session(s);
atomic_dec(&mdsc->num_sessions);
@@ -1197,13 +1328,10 @@ static int iterate_session_caps(struct ceph_mds_session *session,
cap->session = NULL;
list_del_init(&cap->session_caps);
session->s_nr_caps--;
- if (cap->queue_release) {
- list_add_tail(&cap->session_caps,
- &session->s_cap_releases);
- session->s_num_cap_releases++;
- } else {
+ if (cap->queue_release)
+ __ceph_queue_cap_release(session, cap);
+ else
old_cap = cap; /* put_cap it w/o locks held */
- }
}
if (ret < 0)
goto out;
@@ -1286,6 +1414,15 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
ci->i_prealloc_cap_flush = NULL;
}
+
+ if (drop &&
+ ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
}
spin_unlock(&ci->i_ceph_lock);
while (!list_empty(&to_remove)) {
@@ -1638,7 +1775,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
session->s_trim_caps = 0;
}
- ceph_send_cap_releases(mdsc, session);
+ ceph_flush_cap_releases(mdsc, session);
return 0;
}
@@ -1681,8 +1818,8 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc,
/*
* called under s_mutex
*/
-void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+static void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
{
struct ceph_msg *msg = NULL;
struct ceph_mds_cap_release *head;
@@ -1774,6 +1911,81 @@ out_err:
spin_unlock(&session->s_cap_lock);
}
+static void ceph_cap_release_work(struct work_struct *work)
+{
+ struct ceph_mds_session *session =
+ container_of(work, struct ceph_mds_session, s_cap_release_work);
+
+ mutex_lock(&session->s_mutex);
+ if (session->s_state == CEPH_MDS_SESSION_OPEN ||
+ session->s_state == CEPH_MDS_SESSION_HUNG)
+ ceph_send_cap_releases(session->s_mdsc, session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+}
+
+void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ if (mdsc->stopping)
+ return;
+
+ get_session(session);
+ if (queue_work(mdsc->fsc->cap_wq,
+ &session->s_cap_release_work)) {
+ dout("cap release work queued\n");
+ } else {
+ ceph_put_mds_session(session);
+ dout("failed to queue cap release work\n");
+ }
+}
+
+/*
+ * caller holds session->s_cap_lock
+ */
+void __ceph_queue_cap_release(struct ceph_mds_session *session,
+ struct ceph_cap *cap)
+{
+ list_add_tail(&cap->session_caps, &session->s_cap_releases);
+ session->s_num_cap_releases++;
+
+ if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE))
+ ceph_flush_cap_releases(session->s_mdsc, session);
+}
+
+static void ceph_cap_reclaim_work(struct work_struct *work)
+{
+ struct ceph_mds_client *mdsc =
+ container_of(work, struct ceph_mds_client, cap_reclaim_work);
+ int ret = ceph_trim_dentries(mdsc);
+ if (ret == -EAGAIN)
+ ceph_queue_cap_reclaim_work(mdsc);
+}
+
+void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
+{
+ if (mdsc->stopping)
+ return;
+
+ if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {
+ dout("caps reclaim work queued\n");
+ } else {
+ dout("failed to queue caps release work\n");
+ }
+}
+
+void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
+{
+ int val;
+ if (!nr)
+ return;
+ val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);
+ if (!(val % CEPH_CAPS_PER_RELEASE)) {
+ atomic_set(&mdsc->cap_reclaim_pending, 0);
+ ceph_queue_cap_reclaim_work(mdsc);
+ }
+}
+
/*
* requests
*/
@@ -1958,10 +2170,39 @@ retry:
return path;
}
+/* Duplicate the dentry->d_name.name safely */
+static int clone_dentry_name(struct dentry *dentry, const char **ppath,
+ int *ppathlen)
+{
+ u32 len;
+ char *name;
+
+retry:
+ len = READ_ONCE(dentry->d_name.len);
+ name = kmalloc(len + 1, GFP_NOFS);
+ if (!name)
+ return -ENOMEM;
+
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_name.len != len) {
+ spin_unlock(&dentry->d_lock);
+ kfree(name);
+ goto retry;
+ }
+ memcpy(name, dentry->d_name.name, len);
+ spin_unlock(&dentry->d_lock);
+
+ name[len] = '\0';
+ *ppath = name;
+ *ppathlen = len;
+ return 0;
+}
+
static int build_dentry_path(struct dentry *dentry, struct inode *dir,
const char **ppath, int *ppathlen, u64 *pino,
- int *pfreepath)
+ bool *pfreepath, bool parent_locked)
{
+ int ret;
char *path;
rcu_read_lock();
@@ -1970,8 +2211,15 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
*pino = ceph_ino(dir);
rcu_read_unlock();
- *ppath = dentry->d_name.name;
- *ppathlen = dentry->d_name.len;
+ if (parent_locked) {
+ *ppath = dentry->d_name.name;
+ *ppathlen = dentry->d_name.len;
+ } else {
+ ret = clone_dentry_name(dentry, ppath, ppathlen);
+ if (ret)
+ return ret;
+ *pfreepath = true;
+ }
return 0;
}
rcu_read_unlock();
@@ -1979,13 +2227,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
if (IS_ERR(path))
return PTR_ERR(path);
*ppath = path;
- *pfreepath = 1;
+ *pfreepath = true;
return 0;
}
static int build_inode_path(struct inode *inode,
const char **ppath, int *ppathlen, u64 *pino,
- int *pfreepath)
+ bool *pfreepath)
{
struct dentry *dentry;
char *path;
@@ -2001,7 +2249,7 @@ static int build_inode_path(struct inode *inode,
if (IS_ERR(path))
return PTR_ERR(path);
*ppath = path;
- *pfreepath = 1;
+ *pfreepath = true;
return 0;
}
@@ -2012,7 +2260,7 @@ static int build_inode_path(struct inode *inode,
static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
struct inode *rdiri, const char *rpath,
u64 rino, const char **ppath, int *pathlen,
- u64 *ino, int *freepath)
+ u64 *ino, bool *freepath, bool parent_locked)
{
int r = 0;
@@ -2022,7 +2270,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
ceph_snap(rinode));
} else if (rdentry) {
r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
- freepath);
+ freepath, parent_locked);
dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
*ppath);
} else if (rpath || rino) {
@@ -2048,7 +2296,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
const char *path2 = NULL;
u64 ino1 = 0, ino2 = 0;
int pathlen1 = 0, pathlen2 = 0;
- int freepath1 = 0, freepath2 = 0;
+ bool freepath1 = false, freepath2 = false;
int len;
u16 releases;
void *p, *end;
@@ -2056,16 +2304,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
ret = set_request_path_attr(req->r_inode, req->r_dentry,
req->r_parent, req->r_path1, req->r_ino1.ino,
- &path1, &pathlen1, &ino1, &freepath1);
+ &path1, &pathlen1, &ino1, &freepath1,
+ test_bit(CEPH_MDS_R_PARENT_LOCKED,
+ &req->r_req_flags));
if (ret < 0) {
msg = ERR_PTR(ret);
goto out;
}
+ /* If r_old_dentry is set, then assume that its parent is locked */
ret = set_request_path_attr(NULL, req->r_old_dentry,
req->r_old_dentry_dir,
req->r_path2, req->r_ino2.ino,
- &path2, &pathlen2, &ino2, &freepath2);
+ &path2, &pathlen2, &ino2, &freepath2, true);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out_free1;
@@ -2653,7 +2904,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
dout("handle_reply tid %lld result %d\n", tid, result);
rinfo = &req->r_reply_info;
- err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
+ if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
+ err = parse_reply_info(msg, rinfo, (u64)-1);
+ else
+ err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
@@ -2684,7 +2938,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
req->r_op == CEPH_MDS_OP_LSSNAP))
ceph_readdir_prepopulate(req, req->r_session);
- ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
current->journal_info = NULL;
mutex_unlock(&req->r_fill_mutex);
@@ -2693,12 +2946,18 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (realm)
ceph_put_snap_realm(mdsc, realm);
- if (err == 0 && req->r_target_inode &&
- test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
- struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
- spin_lock(&ci->i_unsafe_lock);
- list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
- spin_unlock(&ci->i_unsafe_lock);
+ if (err == 0) {
+ if (req->r_target_inode &&
+ test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ struct ceph_inode_info *ci =
+ ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_target_item,
+ &ci->i_unsafe_iops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+
+ ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
out_err:
mutex_lock(&mdsc->mutex);
@@ -2777,6 +3036,25 @@ bad:
pr_err("mdsc_handle_forward decode error err=%d\n", err);
}
+static int __decode_and_drop_session_metadata(void **p, void *end)
+{
+ /* map<string,string> */
+ u32 n;
+ ceph_decode_32_safe(p, end, n, bad);
+ while (n-- > 0) {
+ u32 len;
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_need(p, end, len, bad);
+ *p += len;
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_need(p, end, len, bad);
+ *p += len;
+ }
+ return 0;
+bad:
+ return -1;
+}
+
/*
* handle a mds session control message
*/
@@ -2784,18 +3062,36 @@ static void handle_session(struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
+ int mds = session->s_mds;
+ int msg_version = le16_to_cpu(msg->hdr.version);
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+ struct ceph_mds_session_head *h;
u32 op;
u64 seq;
- int mds = session->s_mds;
- struct ceph_mds_session_head *h = msg->front.iov_base;
+ unsigned long features = 0;
int wake = 0;
/* decode */
- if (msg->front.iov_len < sizeof(*h))
- goto bad;
+ ceph_decode_need(&p, end, sizeof(*h), bad);
+ h = p;
+ p += sizeof(*h);
+
op = le32_to_cpu(h->op);
seq = le64_to_cpu(h->seq);
+ if (msg_version >= 3) {
+ u32 len;
+ /* version >= 2, metadata */
+ if (__decode_and_drop_session_metadata(&p, end) < 0)
+ goto bad;
+ /* version >= 3, feature bits */
+ ceph_decode_32_safe(&p, end, len, bad);
+ ceph_decode_need(&p, end, len, bad);
+ memcpy(&features, p, min_t(size_t, len, sizeof(features)));
+ p += len;
+ }
+
mutex_lock(&mdsc->mutex);
if (op == CEPH_SESSION_CLOSE) {
get_session(session);
@@ -2821,6 +3117,7 @@ static void handle_session(struct ceph_mds_session *session,
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
pr_info("mds%d reconnect success\n", session->s_mds);
session->s_state = CEPH_MDS_SESSION_OPEN;
+ session->s_features = features;
renewed_caps(mdsc, session, 0);
wake = 1;
if (mdsc->stopping)
@@ -2947,6 +3244,82 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
}
+static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
+{
+ struct ceph_msg *reply;
+ struct ceph_pagelist *_pagelist;
+ struct page *page;
+ __le32 *addr;
+ int err = -ENOMEM;
+
+ if (!recon_state->allow_multi)
+ return -ENOSPC;
+
+ /* can't handle message that contains both caps and realm */
+ BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms);
+
+ /* pre-allocate new pagelist */
+ _pagelist = ceph_pagelist_alloc(GFP_NOFS);
+ if (!_pagelist)
+ return -ENOMEM;
+
+ reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
+ if (!reply)
+ goto fail_msg;
+
+ /* placeholder for nr_caps */
+ err = ceph_pagelist_encode_32(_pagelist, 0);
+ if (err < 0)
+ goto fail;
+
+ if (recon_state->nr_caps) {
+ /* currently encoding caps */
+ err = ceph_pagelist_encode_32(recon_state->pagelist, 0);
+ if (err)
+ goto fail;
+ } else {
+ /* placeholder for nr_realms (currently encoding relams) */
+ err = ceph_pagelist_encode_32(_pagelist, 0);
+ if (err < 0)
+ goto fail;
+ }
+
+ err = ceph_pagelist_encode_8(recon_state->pagelist, 1);
+ if (err)
+ goto fail;
+
+ page = list_first_entry(&recon_state->pagelist->head, struct page, lru);
+ addr = kmap_atomic(page);
+ if (recon_state->nr_caps) {
+ /* currently encoding caps */
+ *addr = cpu_to_le32(recon_state->nr_caps);
+ } else {
+ /* currently encoding relams */
+ *(addr + 1) = cpu_to_le32(recon_state->nr_realms);
+ }
+ kunmap_atomic(addr);
+
+ reply->hdr.version = cpu_to_le16(5);
+ reply->hdr.compat_version = cpu_to_le16(4);
+
+ reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length);
+ ceph_msg_data_add_pagelist(reply, recon_state->pagelist);
+
+ ceph_con_send(&recon_state->session->s_con, reply);
+ ceph_pagelist_release(recon_state->pagelist);
+
+ recon_state->pagelist = _pagelist;
+ recon_state->nr_caps = 0;
+ recon_state->nr_realms = 0;
+ recon_state->msg_version = 5;
+ return 0;
+fail:
+ ceph_msg_put(reply);
+fail_msg:
+ ceph_pagelist_release(_pagelist);
+ return err;
+}
+
/*
* Encode information about a cap for a reconnect with the MDS.
*/
@@ -2966,9 +3339,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
inode, ceph_vinop(inode), cap, cap->cap_id,
ceph_cap_string(cap->issued));
- err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
- if (err)
- return err;
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
@@ -3008,7 +3378,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
if (recon_state->msg_version >= 2) {
int num_fcntl_locks, num_flock_locks;
struct ceph_filelock *flocks = NULL;
- size_t struct_len, total_len = 0;
+ size_t struct_len, total_len = sizeof(u64);
u8 struct_v = 0;
encode_again:
@@ -3043,7 +3413,7 @@ encode_again:
if (recon_state->msg_version >= 3) {
/* version, compat_version and struct_len */
- total_len = 2 * sizeof(u8) + sizeof(u32);
+ total_len += 2 * sizeof(u8) + sizeof(u32);
struct_v = 2;
}
/*
@@ -3060,12 +3430,19 @@ encode_again:
struct_len += sizeof(u64); /* snap_follows */
total_len += struct_len;
- err = ceph_pagelist_reserve(pagelist, total_len);
- if (err) {
- kfree(flocks);
- goto out_err;
+
+ if (pagelist->length + total_len > RECONNECT_MAX_SIZE) {
+ err = send_reconnect_partial(recon_state);
+ if (err)
+ goto out_freeflocks;
+ pagelist = recon_state->pagelist;
}
+ err = ceph_pagelist_reserve(pagelist, total_len);
+ if (err)
+ goto out_freeflocks;
+
+ ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
if (recon_state->msg_version >= 3) {
ceph_pagelist_encode_8(pagelist, struct_v);
ceph_pagelist_encode_8(pagelist, 1);
@@ -3077,7 +3454,7 @@ encode_again:
num_fcntl_locks, num_flock_locks);
if (struct_v >= 2)
ceph_pagelist_encode_64(pagelist, snap_follows);
-
+out_freeflocks:
kfree(flocks);
} else {
u64 pathbase = 0;
@@ -3098,20 +3475,81 @@ encode_again:
}
err = ceph_pagelist_reserve(pagelist,
- pathlen + sizeof(u32) + sizeof(rec.v1));
+ sizeof(u64) + sizeof(u32) +
+ pathlen + sizeof(rec.v1));
if (err) {
- kfree(path);
- goto out_err;
+ goto out_freepath;
}
+ ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
ceph_pagelist_encode_string(pagelist, path, pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
-
+out_freepath:
kfree(path);
}
- recon_state->nr_caps++;
out_err:
+ if (err >= 0)
+ recon_state->nr_caps++;
+ return err;
+}
+
+static int encode_snap_realms(struct ceph_mds_client *mdsc,
+ struct ceph_reconnect_state *recon_state)
+{
+ struct rb_node *p;
+ struct ceph_pagelist *pagelist = recon_state->pagelist;
+ int err = 0;
+
+ if (recon_state->msg_version >= 4) {
+ err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms);
+ if (err < 0)
+ goto fail;
+ }
+
+ /*
+ * snaprealms. we provide mds with the ino, seq (version), and
+ * parent for all of our realms. If the mds has any newer info,
+ * it will tell us.
+ */
+ for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
+ struct ceph_snap_realm *realm =
+ rb_entry(p, struct ceph_snap_realm, node);
+ struct ceph_mds_snaprealm_reconnect sr_rec;
+
+ if (recon_state->msg_version >= 4) {
+ size_t need = sizeof(u8) * 2 + sizeof(u32) +
+ sizeof(sr_rec);
+
+ if (pagelist->length + need > RECONNECT_MAX_SIZE) {
+ err = send_reconnect_partial(recon_state);
+ if (err)
+ goto fail;
+ pagelist = recon_state->pagelist;
+ }
+
+ err = ceph_pagelist_reserve(pagelist, need);
+ if (err)
+ goto fail;
+
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));
+ }
+
+ dout(" adding snap realm %llx seq %lld parent %llx\n",
+ realm->ino, realm->seq, realm->parent_ino);
+ sr_rec.ino = cpu_to_le64(realm->ino);
+ sr_rec.seq = cpu_to_le64(realm->seq);
+ sr_rec.parent = cpu_to_le64(realm->parent_ino);
+
+ err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
+ if (err)
+ goto fail;
+
+ recon_state->nr_realms++;
+ }
+fail:
return err;
}
@@ -3132,18 +3570,17 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_msg *reply;
- struct rb_node *p;
int mds = session->s_mds;
int err = -ENOMEM;
- int s_nr_caps;
- struct ceph_pagelist *pagelist;
- struct ceph_reconnect_state recon_state;
+ struct ceph_reconnect_state recon_state = {
+ .session = session,
+ };
LIST_HEAD(dispose);
pr_info("mds%d reconnect start\n", mds);
- pagelist = ceph_pagelist_alloc(GFP_NOFS);
- if (!pagelist)
+ recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
+ if (!recon_state.pagelist)
goto fail_nopagelist;
reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
@@ -3187,63 +3624,90 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
/* replay unsafe requests */
replay_unsafe_requests(mdsc, session);
+ ceph_early_kick_flushing_caps(mdsc, session);
+
down_read(&mdsc->snap_rwsem);
- /* traverse this session's caps */
- s_nr_caps = session->s_nr_caps;
- err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
+ /* placeholder for nr_caps */
+ err = ceph_pagelist_encode_32(recon_state.pagelist, 0);
if (err)
goto fail;
- recon_state.nr_caps = 0;
- recon_state.pagelist = pagelist;
- if (session->s_con.peer_features & CEPH_FEATURE_MDSENC)
+ if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) {
recon_state.msg_version = 3;
- else
+ recon_state.allow_multi = true;
+ } else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) {
+ recon_state.msg_version = 3;
+ } else {
recon_state.msg_version = 2;
+ }
+ /* trsaverse this session's caps */
err = iterate_session_caps(session, encode_caps_cb, &recon_state);
- if (err < 0)
- goto fail;
spin_lock(&session->s_cap_lock);
session->s_cap_reconnect = 0;
spin_unlock(&session->s_cap_lock);
- /*
- * snaprealms. we provide mds with the ino, seq (version), and
- * parent for all of our realms. If the mds has any newer info,
- * it will tell us.
- */
- for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
- struct ceph_snap_realm *realm =
- rb_entry(p, struct ceph_snap_realm, node);
- struct ceph_mds_snaprealm_reconnect sr_rec;
+ if (err < 0)
+ goto fail;
- dout(" adding snap realm %llx seq %lld parent %llx\n",
- realm->ino, realm->seq, realm->parent_ino);
- sr_rec.ino = cpu_to_le64(realm->ino);
- sr_rec.seq = cpu_to_le64(realm->seq);
- sr_rec.parent = cpu_to_le64(realm->parent_ino);
- err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
- if (err)
- goto fail;
+ /* check if all realms can be encoded into current message */
+ if (mdsc->num_snap_realms) {
+ size_t total_len =
+ recon_state.pagelist->length +
+ mdsc->num_snap_realms *
+ sizeof(struct ceph_mds_snaprealm_reconnect);
+ if (recon_state.msg_version >= 4) {
+ /* number of realms */
+ total_len += sizeof(u32);
+ /* version, compat_version and struct_len */
+ total_len += mdsc->num_snap_realms *
+ (2 * sizeof(u8) + sizeof(u32));
+ }
+ if (total_len > RECONNECT_MAX_SIZE) {
+ if (!recon_state.allow_multi) {
+ err = -ENOSPC;
+ goto fail;
+ }
+ if (recon_state.nr_caps) {
+ err = send_reconnect_partial(&recon_state);
+ if (err)
+ goto fail;
+ }
+ recon_state.msg_version = 5;
+ }
}
- reply->hdr.version = cpu_to_le16(recon_state.msg_version);
+ err = encode_snap_realms(mdsc, &recon_state);
+ if (err < 0)
+ goto fail;
- /* raced with cap release? */
- if (s_nr_caps != recon_state.nr_caps) {
- struct page *page = list_first_entry(&pagelist->head,
- struct page, lru);
+ if (recon_state.msg_version >= 5) {
+ err = ceph_pagelist_encode_8(recon_state.pagelist, 0);
+ if (err < 0)
+ goto fail;
+ }
+
+ if (recon_state.nr_caps || recon_state.nr_realms) {
+ struct page *page =
+ list_first_entry(&recon_state.pagelist->head,
+ struct page, lru);
__le32 *addr = kmap_atomic(page);
- *addr = cpu_to_le32(recon_state.nr_caps);
+ if (recon_state.nr_caps) {
+ WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms);
+ *addr = cpu_to_le32(recon_state.nr_caps);
+ } else if (recon_state.msg_version >= 4) {
+ *(addr + 1) = cpu_to_le32(recon_state.nr_realms);
+ }
kunmap_atomic(addr);
}
- reply->hdr.data_len = cpu_to_le32(pagelist->length);
- ceph_msg_data_add_pagelist(reply, pagelist);
+ reply->hdr.version = cpu_to_le16(recon_state.msg_version);
+ if (recon_state.msg_version >= 4)
+ reply->hdr.compat_version = cpu_to_le16(4);
- ceph_early_kick_flushing_caps(mdsc, session);
+ reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length);
+ ceph_msg_data_add_pagelist(reply, recon_state.pagelist);
ceph_con_send(&session->s_con, reply);
@@ -3254,7 +3718,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
up_read(&mdsc->snap_rwsem);
- ceph_pagelist_release(pagelist);
+ ceph_pagelist_release(recon_state.pagelist);
return;
fail:
@@ -3262,7 +3726,7 @@ fail:
up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
fail_nomsg:
- ceph_pagelist_release(pagelist);
+ ceph_pagelist_release(recon_state.pagelist);
fail_nopagelist:
pr_err("error %d preparing reconnect for mds%d\n", err, mds);
return;
@@ -3580,7 +4044,6 @@ static void delayed_work(struct work_struct *work)
int renew_caps;
dout("mdsc delayed_work\n");
- ceph_check_delayed_caps(mdsc);
mutex_lock(&mdsc->mutex);
renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
@@ -3628,6 +4091,12 @@ static void delayed_work(struct work_struct *work)
}
mutex_unlock(&mdsc->mutex);
+ ceph_check_delayed_caps(mdsc);
+
+ ceph_queue_cap_reclaim_work(mdsc);
+
+ ceph_trim_snapid_map(mdsc);
+
schedule_delayed(mdsc);
}
@@ -3660,6 +4129,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
init_rwsem(&mdsc->snap_rwsem);
mdsc->snap_realms = RB_ROOT;
INIT_LIST_HEAD(&mdsc->snap_empty);
+ mdsc->num_snap_realms = 0;
spin_lock_init(&mdsc->snap_empty_lock);
mdsc->last_tid = 0;
mdsc->oldest_tid = 0;
@@ -3677,11 +4147,19 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
mdsc->num_cap_flushing = 0;
spin_lock_init(&mdsc->cap_dirty_lock);
init_waitqueue_head(&mdsc->cap_flushing_wq);
- spin_lock_init(&mdsc->dentry_lru_lock);
- INIT_LIST_HEAD(&mdsc->dentry_lru);
+ INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
+ atomic_set(&mdsc->cap_reclaim_pending, 0);
+
+ spin_lock_init(&mdsc->dentry_list_lock);
+ INIT_LIST_HEAD(&mdsc->dentry_leases);
+ INIT_LIST_HEAD(&mdsc->dentry_dir_leases);
ceph_caps_init(mdsc);
- ceph_adjust_min_caps(mdsc, fsc->min_caps);
+ ceph_adjust_caps_max_min(mdsc, fsc->mount_options);
+
+ spin_lock_init(&mdsc->snapid_map_lock);
+ mdsc->snapid_map_tree = RB_ROOT;
+ INIT_LIST_HEAD(&mdsc->snapid_map_lru);
init_rwsem(&mdsc->pool_perm_rwsem);
mdsc->pool_perm_tree = RB_ROOT;
@@ -3876,8 +4354,10 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
WARN_ON(!list_empty(&mdsc->cap_delay_list));
mutex_unlock(&mdsc->mutex);
+ ceph_cleanup_snapid_map(mdsc);
ceph_cleanup_empty_realms(mdsc);
+ cancel_work_sync(&mdsc->cap_reclaim_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
dout("stopped\n");
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 729da155ebf0..50385a481fdb 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -21,11 +21,14 @@
#define CEPHFS_FEATURE_REPLY_ENCODING 9
#define CEPHFS_FEATURE_RECLAIM_CLIENT 10
#define CEPHFS_FEATURE_LAZY_CAP_WANTED 11
+#define CEPHFS_FEATURE_MULTI_RECONNECT 12
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
0, 1, 2, 3, 4, 5, 6, 7, \
CEPHFS_FEATURE_MIMIC, \
+ CEPHFS_FEATURE_REPLY_ENCODING, \
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
+ CEPHFS_FEATURE_MULTI_RECONNECT, \
}
#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
@@ -65,6 +68,7 @@ struct ceph_mds_reply_info_in {
char *pool_ns_data;
u64 max_bytes;
u64 max_files;
+ s32 dir_pin;
};
struct ceph_mds_reply_dir_entry {
@@ -152,6 +156,7 @@ struct ceph_mds_session {
int s_mds;
int s_state;
unsigned long s_ttl; /* time until mds kills us */
+ unsigned long s_features;
u64 s_seq; /* incoming msg seq # */
struct mutex s_mutex; /* serialize session messages */
@@ -167,19 +172,20 @@ struct ceph_mds_session {
/* protected by s_cap_lock */
spinlock_t s_cap_lock;
struct list_head s_caps; /* all caps issued by this session */
+ struct ceph_cap *s_cap_iterator;
int s_nr_caps, s_trim_caps;
int s_num_cap_releases;
int s_cap_reconnect;
int s_readonly;
struct list_head s_cap_releases; /* waiting cap_release messages */
- struct ceph_cap *s_cap_iterator;
+ struct work_struct s_cap_release_work;
/* protected by mutex */
struct list_head s_cap_flushing; /* inodes w/ flushing caps */
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
- refcount_t s_ref;
+ refcount_t s_ref;
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
};
@@ -310,6 +316,15 @@ struct ceph_pool_perm {
char pool_ns[];
};
+struct ceph_snapid_map {
+ struct rb_node node;
+ struct list_head lru;
+ atomic_t ref;
+ u64 snap;
+ dev_t dev;
+ unsigned long last_used;
+};
+
/*
* mds client state
*/
@@ -341,6 +356,7 @@ struct ceph_mds_client {
struct rw_semaphore snap_rwsem;
struct rb_root snap_realms;
struct list_head snap_empty;
+ int num_snap_realms;
spinlock_t snap_empty_lock; /* protect snap_empty */
u64 last_tid; /* most recent mds request */
@@ -362,6 +378,9 @@ struct ceph_mds_client {
spinlock_t cap_dirty_lock; /* protects above items */
wait_queue_head_t cap_flushing_wq;
+ struct work_struct cap_reclaim_work;
+ atomic_t cap_reclaim_pending;
+
/*
* Cap reservations
*
@@ -378,13 +397,18 @@ struct ceph_mds_client {
unreserved) */
int caps_total_count; /* total caps allocated */
int caps_use_count; /* in use */
+ int caps_use_max; /* max used caps */
int caps_reserve_count; /* unused, reserved */
int caps_avail_count; /* unused, unreserved */
int caps_min_count; /* keep at least this many
(unreserved) */
- spinlock_t dentry_lru_lock;
- struct list_head dentry_lru;
- int num_dentry;
+ spinlock_t dentry_list_lock;
+ struct list_head dentry_leases; /* fifo list */
+ struct list_head dentry_dir_leases; /* lru list */
+
+ spinlock_t snapid_map_lock;
+ struct rb_root snapid_map_tree;
+ struct list_head snapid_map_lru;
struct rw_semaphore pool_perm_rwsem;
struct rb_root pool_perm_tree;
@@ -438,9 +462,12 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
kref_put(&req->r_kref, ceph_mdsc_release_request);
}
-extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
-
+extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
+ struct ceph_cap *cap);
+extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
+extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index f74193da0e09..b26e12cd8ec3 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -3,12 +3,13 @@
#include <linux/sort.h>
#include <linux/slab.h>
-
#include "super.h"
#include "mds_client.h"
-
#include <linux/ceph/decode.h>
+/* unused map expires after 5 minutes */
+#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ)
+
/*
* Snapshots in ceph are driven in large part by cooperation from the
* client. In contrast to local file systems or file servers that
@@ -124,6 +125,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
INIT_LIST_HEAD(&realm->inodes_with_caps);
spin_lock_init(&realm->inodes_with_caps_lock);
__insert_snap_realm(&mdsc->snap_realms, realm);
+ mdsc->num_snap_realms++;
+
dout("create_snap_realm %llx %p\n", realm->ino, realm);
return realm;
}
@@ -175,6 +178,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
+ mdsc->num_snap_realms--;
if (realm->parent) {
list_del_init(&realm->child_item);
@@ -568,7 +572,12 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
old_snapc = NULL;
update_snapc:
- if (ci->i_head_snapc) {
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ci->i_head_snapc = NULL;
+ } else {
ci->i_head_snapc = ceph_get_snap_context(new_snapc);
dout(" new snapc is %p\n", new_snapc);
}
@@ -986,3 +995,154 @@ out:
up_write(&mdsc->snap_rwsem);
return;
}
+
+struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+ u64 snap)
+{
+ struct ceph_snapid_map *sm, *exist;
+ struct rb_node **p, *parent;
+ int ret;
+
+ exist = NULL;
+ spin_lock(&mdsc->snapid_map_lock);
+ p = &mdsc->snapid_map_tree.rb_node;
+ while (*p) {
+ exist = rb_entry(*p, struct ceph_snapid_map, node);
+ if (snap > exist->snap) {
+ p = &(*p)->rb_left;
+ } else if (snap < exist->snap) {
+ p = &(*p)->rb_right;
+ } else {
+ if (atomic_inc_return(&exist->ref) == 1)
+ list_del_init(&exist->lru);
+ break;
+ }
+ exist = NULL;
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+ if (exist) {
+ dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ return exist;
+ }
+
+ sm = kmalloc(sizeof(*sm), GFP_NOFS);
+ if (!sm)
+ return NULL;
+
+ ret = get_anon_bdev(&sm->dev);
+ if (ret < 0) {
+ kfree(sm);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&sm->lru);
+ atomic_set(&sm->ref, 1);
+ sm->snap = snap;
+
+ exist = NULL;
+ parent = NULL;
+ p = &mdsc->snapid_map_tree.rb_node;
+ spin_lock(&mdsc->snapid_map_lock);
+ while (*p) {
+ parent = *p;
+ exist = rb_entry(*p, struct ceph_snapid_map, node);
+ if (snap > exist->snap)
+ p = &(*p)->rb_left;
+ else if (snap < exist->snap)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ if (exist) {
+ if (atomic_inc_return(&exist->ref) == 1)
+ list_del_init(&exist->lru);
+ } else {
+ rb_link_node(&sm->node, parent, p);
+ rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+ if (exist) {
+ free_anon_bdev(sm->dev);
+ kfree(sm);
+ dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ return exist;
+ }
+
+ dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
+ return sm;
+}
+
+void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+ struct ceph_snapid_map *sm)
+{
+ if (!sm)
+ return;
+ if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
+ if (!RB_EMPTY_NODE(&sm->node)) {
+ sm->last_used = jiffies;
+ list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
+ spin_unlock(&mdsc->snapid_map_lock);
+ } else {
+ /* already cleaned up by
+ * ceph_cleanup_snapid_map() */
+ spin_unlock(&mdsc->snapid_map_lock);
+ kfree(sm);
+ }
+ }
+}
+
+void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
+{
+ struct ceph_snapid_map *sm;
+ unsigned long now;
+ LIST_HEAD(to_free);
+
+ spin_lock(&mdsc->snapid_map_lock);
+ now = jiffies;
+
+ while (!list_empty(&mdsc->snapid_map_lru)) {
+ sm = list_first_entry(&mdsc->snapid_map_lru,
+ struct ceph_snapid_map, lru);
+ if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now))
+ break;
+
+ rb_erase(&sm->node, &mdsc->snapid_map_tree);
+ list_move(&sm->lru, &to_free);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+
+ while (!list_empty(&to_free)) {
+ sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+ list_del(&sm->lru);
+ dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
+ free_anon_bdev(sm->dev);
+ kfree(sm);
+ }
+}
+
+void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
+{
+ struct ceph_snapid_map *sm;
+ struct rb_node *p;
+ LIST_HEAD(to_free);
+
+ spin_lock(&mdsc->snapid_map_lock);
+ while ((p = rb_first(&mdsc->snapid_map_tree))) {
+ sm = rb_entry(p, struct ceph_snapid_map, node);
+ rb_erase(p, &mdsc->snapid_map_tree);
+ RB_CLEAR_NODE(p);
+ list_move(&sm->lru, &to_free);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+
+ while (!list_empty(&to_free)) {
+ sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+ list_del(&sm->lru);
+ free_anon_bdev(sm->dev);
+ if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
+ pr_err("snapid map %llx -> %x still in use\n",
+ sm->snap, sm->dev);
+ }
+ }
+}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index da2cd8e89062..6d5bb2f74612 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -133,6 +133,7 @@ enum {
Opt_rasize,
Opt_caps_wanted_delay_min,
Opt_caps_wanted_delay_max,
+ Opt_caps_max,
Opt_readdir_max_entries,
Opt_readdir_max_bytes,
Opt_congestion_kb,
@@ -175,6 +176,7 @@ static match_table_t fsopt_tokens = {
{Opt_rasize, "rasize=%d"},
{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+ {Opt_caps_max, "caps_max=%d"},
{Opt_readdir_max_entries, "readdir_max_entries=%d"},
{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
{Opt_congestion_kb, "write_congestion_kb=%d"},
@@ -286,6 +288,11 @@ static int parse_fsopt_token(char *c, void *private)
return -EINVAL;
fsopt->caps_wanted_delay_max = intval;
break;
+ case Opt_caps_max:
+ if (intval < 0)
+ return -EINVAL;
+ fsopt->caps_max = intval;
+ break;
case Opt_readdir_max_entries:
if (intval < 1)
return -EINVAL;
@@ -576,6 +583,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",rasize=%d", fsopt->rasize);
if (fsopt->congestion_kb != default_congestion_kb())
seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+ if (fsopt->caps_max)
+ seq_printf(m, ",caps_max=%d", fsopt->caps_max);
if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
seq_printf(m, ",caps_wanted_delay_min=%d",
fsopt->caps_wanted_delay_min);
@@ -671,6 +680,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
if (!fsc->trunc_wq)
goto fail_pg_inv_wq;
+ fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1);
+ if (!fsc->cap_wq)
+ goto fail_trunc_wq;
/* set up mempools */
err = -ENOMEM;
@@ -678,13 +690,12 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
size = sizeof (struct page *) * (page_count ? page_count : 1);
fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
if (!fsc->wb_pagevec_pool)
- goto fail_trunc_wq;
-
- /* caps */
- fsc->min_caps = fsopt->max_readdir;
+ goto fail_cap_wq;
return fsc;
+fail_cap_wq:
+ destroy_workqueue(fsc->cap_wq);
fail_trunc_wq:
destroy_workqueue(fsc->trunc_wq);
fail_pg_inv_wq:
@@ -706,6 +717,7 @@ static void flush_fs_workqueues(struct ceph_fs_client *fsc)
flush_workqueue(fsc->wb_wq);
flush_workqueue(fsc->pg_inv_wq);
flush_workqueue(fsc->trunc_wq);
+ flush_workqueue(fsc->cap_wq);
}
static void destroy_fs_client(struct ceph_fs_client *fsc)
@@ -715,6 +727,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
destroy_workqueue(fsc->wb_wq);
destroy_workqueue(fsc->pg_inv_wq);
destroy_workqueue(fsc->trunc_wq);
+ destroy_workqueue(fsc->cap_wq);
mempool_destroy(fsc->wb_pagevec_pool);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index dfb64a5211b6..16c03188578e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -79,6 +79,7 @@ struct ceph_mount_options {
int rasize; /* max readahead */
int congestion_kb; /* max writeback in flight */
int caps_wanted_delay_min, caps_wanted_delay_max;
+ int caps_max;
int max_readdir; /* max readdir result (entires) */
int max_readdir_bytes; /* max readdir result (bytes) */
@@ -100,17 +101,18 @@ struct ceph_fs_client {
struct ceph_client *client;
unsigned long mount_state;
- int min_caps; /* min caps i added */
loff_t max_file_size;
struct ceph_mds_client *mdsc;
/* writeback */
mempool_t *wb_pagevec_pool;
+ atomic_long_t writeback_count;
+
struct workqueue_struct *wb_wq;
struct workqueue_struct *pg_inv_wq;
struct workqueue_struct *trunc_wq;
- atomic_long_t writeback_count;
+ struct workqueue_struct *cap_wq;
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dentry_lru, *debugfs_caps;
@@ -260,17 +262,22 @@ struct ceph_inode_xattr {
* Ceph dentry state
*/
struct ceph_dentry_info {
+ struct dentry *dentry;
struct ceph_mds_session *lease_session;
+ struct list_head lease_list;
+ unsigned flags;
int lease_shared_gen;
u32 lease_gen;
u32 lease_seq;
unsigned long lease_renew_after, lease_renew_from;
- struct list_head lru;
- struct dentry *dentry;
unsigned long time;
u64 offset;
};
+#define CEPH_DENTRY_REFERENCED 1
+#define CEPH_DENTRY_LEASE_LIST 2
+#define CEPH_DENTRY_SHRINK_LIST 4
+
struct ceph_inode_xattrs_info {
/*
* (still encoded) xattr blob. we avoid the overhead of parsing
@@ -318,6 +325,8 @@ struct ceph_inode_info {
/* quotas */
u64 i_max_bytes, i_max_files;
+ s32 i_dir_pin;
+
struct rb_root i_fragtree;
int i_fragtree_nsplits;
struct mutex i_fragtree_mutex;
@@ -370,7 +379,10 @@ struct ceph_inode_info {
struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
spinlock_t i_unsafe_lock;
- struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+ union {
+ struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+ struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
+ };
int i_snap_realm_counter; /* snap realm (if caps) */
struct list_head i_snap_realm_item;
struct list_head i_snap_flush_item;
@@ -587,7 +599,7 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
struct ceph_inode_frag *pfrag,
int *found);
-static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
+static inline struct ceph_dentry_info *ceph_dentry(const struct dentry *dentry)
{
return (struct ceph_dentry_info *)dentry->d_fsdata;
}
@@ -656,7 +668,8 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
extern void ceph_caps_init(struct ceph_mds_client *mdsc);
extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
-extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
+extern void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
+ struct ceph_mount_options *fsopt);
extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need);
extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -837,6 +850,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+ u64 snap);
+extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+ struct ceph_snapid_map *sm);
+extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
+extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
+
+
/*
* a cap_snap is "pending" if it is still awaiting an in-progress
* sync write (that may/may not still update size, mtime, etc.).
@@ -975,11 +996,11 @@ extern void ceph_add_cap(struct inode *inode,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void __ceph_remove_caps(struct inode* inode);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
extern int ceph_is_any_caps(struct inode *inode);
-extern void ceph_queue_caps_release(struct inode *inode);
extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
@@ -1049,10 +1070,10 @@ extern int ceph_handle_snapdir(struct ceph_mds_request *req,
extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err);
-extern void ceph_dentry_lru_add(struct dentry *dn);
-extern void ceph_dentry_lru_touch(struct dentry *dn);
-extern void ceph_dentry_lru_del(struct dentry *dn);
+extern void __ceph_dentry_lease_touch(struct ceph_dentry_info *di);
+extern void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
+extern int ceph_trim_dentries(struct ceph_mds_client *mdsc);
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 316f6ad10644..0cc42c8879e9 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -228,8 +228,19 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
ci->i_rctime.tv_nsec);
}
-/* quotas */
+/* dir pin */
+static bool ceph_vxattrcb_dir_pin_exists(struct ceph_inode_info *ci)
+{
+ return ci->i_dir_pin != -ENODATA;
+}
+
+static size_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%d", (int)ci->i_dir_pin);
+}
+/* quotas */
static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci)
{
bool ret = false;
@@ -315,6 +326,13 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_RSTAT_FIELD(dir, rbytes),
XATTR_RSTAT_FIELD(dir, rctime),
{
+ .name = "ceph.dir.pin",
+ .name_size = sizeof("ceph.dir_pin"),
+ .getxattr_cb = ceph_vxattrcb_dir_pin,
+ .exists_cb = ceph_vxattrcb_dir_pin_exists,
+ .flags = VXATTR_FLAG_HIDDEN,
+ },
+ {
.name = "ceph.quota",
.name_size = sizeof("ceph.quota"),
.getxattr_cb = ceph_vxattrcb_quota,
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index f1ddc9d03c10..76724efc831c 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -117,25 +117,25 @@ config CIFS_UPCALL
secure Kerberos authentication is required). If unsure, say Y.
config CIFS_XATTR
- bool "CIFS extended attributes"
- depends on CIFS
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page for details).
- CIFS maps the name of extended attributes beginning with the user
- namespace prefix to SMB/CIFS EAs. EAs are stored on Windows
- servers without the user namespace prefix, but their names are
- seen by Linux cifs clients prefaced by the user namespace prefix.
- The system namespace (used by some filesystems to store ACLs) is
- not supported at this time.
-
- If unsure, say Y.
+ bool "CIFS extended attributes"
+ depends on CIFS
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page for details).
+ CIFS maps the name of extended attributes beginning with the user
+ namespace prefix to SMB/CIFS EAs. EAs are stored on Windows
+ servers without the user namespace prefix, but their names are
+ seen by Linux cifs clients prefaced by the user namespace prefix.
+ The system namespace (used by some filesystems to store ACLs) is
+ not supported at this time.
+
+ If unsure, say Y.
config CIFS_POSIX
- bool "CIFS POSIX Extensions"
- depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR
- help
- Enabling this option will cause the cifs client to attempt to
+ bool "CIFS POSIX Extensions"
+ depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR
+ help
+ Enabling this option will cause the cifs client to attempt to
negotiate a newer dialect with servers, such as Samba 3.0.5
or later, that optionally can handle more POSIX like (rather
than Windows like) file behavior. It also enables
@@ -144,61 +144,62 @@ config CIFS_POSIX
CIFS POSIX ACL support. If unsure, say N.
config CIFS_ACL
- bool "Provide CIFS ACL support"
- depends on CIFS_XATTR && KEYS
- help
- Allows fetching CIFS/NTFS ACL from the server. The DACL blob
- is handed over to the application/caller. See the man
- page for getcifsacl for more information. If unsure, say Y.
+ bool "Provide CIFS ACL support"
+ depends on CIFS_XATTR && KEYS
+ help
+ Allows fetching CIFS/NTFS ACL from the server. The DACL blob
+ is handed over to the application/caller. See the man
+ page for getcifsacl for more information. If unsure, say Y.
config CIFS_DEBUG
bool "Enable CIFS debugging routines"
default y
depends on CIFS
help
- Enabling this option adds helpful debugging messages to
- the cifs code which increases the size of the cifs module.
- If unsure, say Y.
+ Enabling this option adds helpful debugging messages to
+ the cifs code which increases the size of the cifs module.
+ If unsure, say Y.
+
config CIFS_DEBUG2
bool "Enable additional CIFS debugging routines"
depends on CIFS_DEBUG
help
- Enabling this option adds a few more debugging routines
- to the cifs code which slightly increases the size of
- the cifs module and can cause additional logging of debug
- messages in some error paths, slowing performance. This
- option can be turned off unless you are debugging
- cifs problems. If unsure, say N.
+ Enabling this option adds a few more debugging routines
+ to the cifs code which slightly increases the size of
+ the cifs module and can cause additional logging of debug
+ messages in some error paths, slowing performance. This
+ option can be turned off unless you are debugging
+ cifs problems. If unsure, say N.
config CIFS_DEBUG_DUMP_KEYS
bool "Dump encryption keys for offline decryption (Unsafe)"
depends on CIFS_DEBUG
help
- Enabling this will dump the encryption and decryption keys
- used to communicate on an encrypted share connection on the
- console. This allows Wireshark to decrypt and dissect
- encrypted network captures. Enable this carefully.
- If unsure, say N.
+ Enabling this will dump the encryption and decryption keys
+ used to communicate on an encrypted share connection on the
+ console. This allows Wireshark to decrypt and dissect
+ encrypted network captures. Enable this carefully.
+ If unsure, say N.
config CIFS_DFS_UPCALL
- bool "DFS feature support"
- depends on CIFS && KEYS
- select DNS_RESOLVER
- help
- Distributed File System (DFS) support is used to access shares
- transparently in an enterprise name space, even if the share
- moves to a different server. This feature also enables
- an upcall mechanism for CIFS which contacts userspace helper
- utilities to provide server name resolution (host names to
- IP addresses) which is needed in order to reconnect to
- servers if their addresses change or for implicit mounts of
- DFS junction points. If unsure, say Y.
+ bool "DFS feature support"
+ depends on CIFS && KEYS
+ select DNS_RESOLVER
+ help
+ Distributed File System (DFS) support is used to access shares
+ transparently in an enterprise name space, even if the share
+ moves to a different server. This feature also enables
+ an upcall mechanism for CIFS which contacts userspace helper
+ utilities to provide server name resolution (host names to
+ IP addresses) which is needed in order to reconnect to
+ servers if their addresses change or for implicit mounts of
+ DFS junction points. If unsure, say Y.
config CIFS_NFSD_EXPORT
- bool "Allow nfsd to export CIFS file system"
- depends on CIFS && BROKEN
- help
- Allows NFS server to export a CIFS mounted share (nfsd over cifs)
+ bool "Allow nfsd to export CIFS file system"
+ depends on CIFS && BROKEN
+ help
+ Allows NFS server to export a CIFS mounted share (nfsd over cifs)
config CIFS_SMB_DIRECT
bool "SMB Direct support (Experimental)"
@@ -209,10 +210,9 @@ config CIFS_SMB_DIRECT
say N.
config CIFS_FSCACHE
- bool "Provide CIFS client caching support"
- depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
- help
- Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
- to be cached locally on disk through the general filesystem cache
- manager. If unsure, say N.
-
+ bool "Provide CIFS client caching support"
+ depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
+ help
+ Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
+ to be cached locally on disk through the general filesystem cache
+ manager. If unsure, say N.
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index e92a2fee3c57..13c1288b04a7 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -115,7 +115,12 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
seq_puts(m, " type: CDROM ");
else
seq_printf(m, " type: %d ", dev_type);
- if (tcon->seal)
+
+ seq_printf(m, "Serial Number: 0x%x", tcon->vol_serial_number);
+
+ if ((tcon->seal) ||
+ (tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) ||
+ (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA))
seq_printf(m, " Encrypted");
if (tcon->nocase)
seq_printf(m, " nocase");
@@ -371,6 +376,10 @@ skip_rdma:
atomic_read(&server->in_send),
atomic_read(&server->num_waiters));
#endif
+ if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
+ seq_puts(m, " encrypted");
+ if (ses->sign)
+ seq_puts(m, " signed");
seq_puts(m, "\n\tShares:");
j = 0;
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d9b99abe1243..5d83c924cc47 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -285,9 +285,9 @@ static void dump_referral(const struct dfs_info3_param *ref)
{
cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name);
cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name);
- cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n",
+ cifs_dbg(FYI, "DFS: fl: %d, srv_type: %d\n",
ref->flags, ref->server_type);
- cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n",
+ cifs_dbg(FYI, "DFS: ref_flags: %d, path_consumed: %d\n",
ref->ref_flag, ref->path_consumed);
}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 42f0d67f1054..ed49222abecb 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -58,6 +58,7 @@ struct cifs_sb_info {
spinlock_t tlink_tree_lock;
struct tcon_link *master_tlink;
struct nls_table *local_nls;
+ unsigned int bsize;
unsigned int rsize;
unsigned int wsize;
unsigned long actimeo; /* attribute cache timeout (jiffies) */
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index d8bce2f862de..086ddc5108af 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -43,6 +43,9 @@ struct smb_snapshot_array {
/* snapshots[]; */
} __packed;
+/* query_info flags */
+#define PASSTHRU_QUERY_INFO 0x00000000
+#define PASSTHRU_FSCTL 0x00000001
struct smb_query_info {
__u32 info_type;
__u32 file_info_class;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 62d48d486d8f..a05bf1d6e1d0 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -381,7 +381,7 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
seq_puts(s, "ntlm");
break;
case Kerberos:
- seq_puts(s, "krb5");
+ seq_printf(s, "krb5,cruid=%u", from_kuid_munged(&init_user_ns,ses->cred_uid));
break;
case RawNTLMSSP:
seq_puts(s, "ntlmssp");
@@ -554,10 +554,13 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",rsize=%u", cifs_sb->rsize);
seq_printf(s, ",wsize=%u", cifs_sb->wsize);
+ seq_printf(s, ",bsize=%u", cifs_sb->bsize);
seq_printf(s, ",echo_interval=%lu",
tcon->ses->server->echo_interval / HZ);
if (tcon->snapshot_time)
seq_printf(s, ",snapshot=%llu", tcon->snapshot_time);
+ if (tcon->handle_timeout)
+ seq_printf(s, ",handletimeout=%u", tcon->handle_timeout);
/* convert actimeo and display it in seconds */
seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
@@ -1007,7 +1010,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
unsigned int xid;
int rc;
- if (remap_flags & ~REMAP_FILE_ADVISORY)
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
cifs_dbg(FYI, "clone range\n");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7652551a1fc4..5c0298b9998f 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -150,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.17"
+#define CIFS_VERSION "2.19"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 94dbdbe5be34..585ad3207cb1 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -60,6 +60,12 @@
#define CIFS_MAX_ACTIMEO (1 << 30)
/*
+ * Max persistent and resilient handle timeout (milliseconds).
+ * Windows durable max was 960000 (16 minutes)
+ */
+#define SMB3_MAX_HANDLE_TIMEOUT 960000
+
+/*
* MAX_REQ is the maximum number of requests that WE will send
* on one socket concurrently.
*/
@@ -216,6 +222,7 @@ struct cifs_io_parms;
struct cifs_search_info;
struct cifsInodeInfo;
struct cifs_open_parms;
+struct cifs_credits;
struct smb_version_operations {
int (*send_cancel)(struct TCP_Server_Info *, struct smb_rqst *,
@@ -230,12 +237,15 @@ struct smb_version_operations {
/* check response: verify signature, map error */
int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *,
bool);
- void (*add_credits)(struct TCP_Server_Info *, const unsigned int,
- const int);
+ void (*add_credits)(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits,
+ const int optype);
void (*set_credits)(struct TCP_Server_Info *, const int);
int * (*get_credits_field)(struct TCP_Server_Info *, const int);
unsigned int (*get_credits)(struct mid_q_entry *);
__u64 (*get_next_mid)(struct TCP_Server_Info *);
+ void (*revert_current_mid)(struct TCP_Server_Info *server,
+ const unsigned int val);
/* data offset from read response message */
unsigned int (*read_data_offset)(char *);
/*
@@ -383,8 +393,8 @@ struct smb_version_operations {
struct cifs_fid *);
/* calculate a size of SMB message */
unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi);
- /* check for STATUS_PENDING and process it in a positive case */
- bool (*is_status_pending)(char *, struct TCP_Server_Info *, int);
+ /* check for STATUS_PENDING and process the response if yes */
+ bool (*is_status_pending)(char *buf, struct TCP_Server_Info *server);
/* check for STATUS_NETWORK_SESSION_EXPIRED */
bool (*is_session_expired)(char *);
/* send oplock break response */
@@ -452,7 +462,11 @@ struct smb_version_operations {
unsigned int (*wp_retry_size)(struct inode *);
/* get mtu credits */
int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int,
- unsigned int *, unsigned int *);
+ unsigned int *, struct cifs_credits *);
+ /* adjust previously taken mtu credits to request size */
+ int (*adjust_credits)(struct TCP_Server_Info *server,
+ struct cifs_credits *credits,
+ const unsigned int payload_size);
/* check if we need to issue closedir */
bool (*dir_needs_close)(struct cifsFileInfo *);
long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t,
@@ -471,6 +485,14 @@ struct smb_version_operations {
struct cifs_tcon *tcon,
__le16 *path, int is_dir,
unsigned long p);
+ /* make unix special files (block, char, fifo, socket) */
+ int (*make_node)(unsigned int xid,
+ struct inode *inode,
+ struct dentry *dentry,
+ struct cifs_tcon *tcon,
+ char *full_path,
+ umode_t mode,
+ dev_t device_number);
};
struct smb_version_values {
@@ -557,6 +579,7 @@ struct smb_vol {
bool resilient:1; /* noresilient not required since not fored for CA */
bool domainauto:1;
bool rdma:1;
+ unsigned int bsize;
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -569,6 +592,7 @@ struct smb_vol {
struct nls_table *local_nls;
unsigned int echo_interval; /* echo interval in secs */
__u64 snapshot_time; /* needed for timewarp tokens */
+ __u32 handle_timeout; /* persistent and durable handle timeout in ms */
unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */
};
@@ -710,6 +734,11 @@ struct TCP_Server_Info {
int nr_targets;
};
+struct cifs_credits {
+ unsigned int value;
+ unsigned int instance;
+};
+
static inline unsigned int
in_flight(struct TCP_Server_Info *server)
{
@@ -721,28 +750,28 @@ in_flight(struct TCP_Server_Info *server)
}
static inline bool
-has_credits(struct TCP_Server_Info *server, int *credits)
+has_credits(struct TCP_Server_Info *server, int *credits, int num_credits)
{
int num;
spin_lock(&server->req_lock);
num = *credits;
spin_unlock(&server->req_lock);
- return num > 0;
+ return num >= num_credits;
}
static inline void
-add_credits(struct TCP_Server_Info *server, const unsigned int add,
+add_credits(struct TCP_Server_Info *server, const struct cifs_credits *credits,
const int optype)
{
- server->ops->add_credits(server, add, optype);
+ server->ops->add_credits(server, credits, optype);
}
static inline void
-add_credits_and_wake_if(struct TCP_Server_Info *server, const unsigned int add,
- const int optype)
+add_credits_and_wake_if(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits, const int optype)
{
- if (add) {
- server->ops->add_credits(server, add, optype);
+ if (credits->value) {
+ server->ops->add_credits(server, credits, optype);
wake_up(&server->request_q);
}
}
@@ -753,6 +782,14 @@ set_credits(struct TCP_Server_Info *server, const int val)
server->ops->set_credits(server, val);
}
+static inline int
+adjust_credits(struct TCP_Server_Info *server, struct cifs_credits *credits,
+ const unsigned int payload_size)
+{
+ return server->ops->adjust_credits ?
+ server->ops->adjust_credits(server, credits, payload_size) : 0;
+}
+
static inline __le64
get_next_mid64(struct TCP_Server_Info *server)
{
@@ -770,6 +807,22 @@ get_next_mid(struct TCP_Server_Info *server)
return cpu_to_le16(mid);
}
+static inline void
+revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
+{
+ if (server->ops->revert_current_mid)
+ server->ops->revert_current_mid(server, val);
+}
+
+static inline void
+revert_current_mid_from_hdr(struct TCP_Server_Info *server,
+ const struct smb2_sync_hdr *shdr)
+{
+ unsigned int num = le16_to_cpu(shdr->CreditCharge);
+
+ return revert_current_mid(server, num > 0 ? num : 1);
+}
+
static inline __u16
get_mid(const struct smb_hdr *smb)
{
@@ -924,11 +977,14 @@ cap_unix(struct cifs_ses *ses)
struct cached_fid {
bool is_valid:1; /* Do we have a useable root fid */
+ bool file_all_info_is_valid:1;
+
struct kref refcount;
struct cifs_fid *fid;
struct mutex fid_mutex;
struct cifs_tcon *tcon;
struct work_struct lease_break;
+ struct smb2_file_all_info file_all_info;
};
/*
@@ -1009,6 +1065,7 @@ struct cifs_tcon {
__u32 vol_serial_number;
__le64 vol_create_time;
__u64 snapshot_time; /* for timewarp tokens - timestamp of snapshot */
+ __u32 handle_timeout; /* persistent and durable handle timeout in ms */
__u32 ss_flags; /* sector size flags */
__u32 perf_sector_size; /* best sector size for perf */
__u32 max_chunks;
@@ -1234,7 +1291,7 @@ struct cifs_readdata {
unsigned int pagesz;
unsigned int page_offset;
unsigned int tailsz;
- unsigned int credits;
+ struct cifs_credits credits;
unsigned int nr_pages;
struct page **pages;
};
@@ -1260,7 +1317,7 @@ struct cifs_writedata {
unsigned int pagesz;
unsigned int page_offset;
unsigned int tailsz;
- unsigned int credits;
+ struct cifs_credits credits;
unsigned int nr_pages;
struct page **pages;
};
@@ -1276,6 +1333,7 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file)
}
struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file);
+void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr);
void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
#define CIFS_CACHE_READ_FLG 1
@@ -1422,6 +1480,7 @@ struct mid_q_entry {
struct kref refcount;
struct TCP_Server_Info *server; /* server corresponding to this mid */
__u64 mid; /* multiplex id */
+ __u16 credits; /* number of credits consumed by this mid */
__u32 pid; /* process id */
__u32 sequence_number; /* for CIFS signing */
unsigned long when_alloc; /* when mid was created */
@@ -1696,6 +1755,7 @@ require use of the stronger protocol */
* GlobalMid_Lock protects:
* list operations on pending_mid_q and oplockQ
* updates to XID counters, multiplex id and SMB sequence numbers
+ * list operations on global DnotifyReqList
* tcp_ses_lock protects:
* list operations on tcp and SMB session lists
* tcon->open_file_lock protects the list of open files hanging off the tcon
@@ -1796,6 +1856,7 @@ GLOBAL_EXTERN spinlock_t gidsidlock;
#endif /* CONFIG_CIFS_ACL */
void cifs_oplock_break(struct work_struct *work);
+void cifs_queue_oplock_break(struct cifsFileInfo *cfile);
extern const struct slow_work_ops cifs_oplock_break_ops;
extern struct workqueue_struct *cifsiod_wq;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 336c116995d7..4f96b3b00a7a 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -93,7 +93,8 @@ extern int cifs_discard_remaining_data(struct TCP_Server_Info *server);
extern int cifs_call_async(struct TCP_Server_Info *server,
struct smb_rqst *rqst,
mid_receive_t *receive, mid_callback_t *callback,
- mid_handle_t *handle, void *cbdata, const int flags);
+ mid_handle_t *handle, void *cbdata, const int flags,
+ const struct cifs_credits *exist_credits);
extern int cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
struct smb_rqst *rqst, int *resp_buf_type,
const int flags, struct kvec *resp_iov);
@@ -115,7 +116,7 @@ extern int cifs_check_receive(struct mid_q_entry *mid,
struct TCP_Server_Info *server, bool log_error);
extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server,
unsigned int size, unsigned int *num,
- unsigned int *credits);
+ struct cifs_credits *credits);
extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
struct kvec *, int /* nvec to send */,
int * /* type of buf returned */, const int flags,
@@ -133,6 +134,9 @@ extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
+extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode,
+ bool fsuid_only,
+ struct cifsFileInfo **ret_file);
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index bb54ccf8481c..f43747c062a7 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -139,8 +139,8 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
return -ENOMEM;
if (tcon->ipc) {
- snprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
- tcon->ses->server->hostname);
+ scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
+ tcon->ses->server->hostname);
rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
goto out;
}
@@ -172,7 +172,7 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
continue;
}
- snprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
+ scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
if (!rc)
@@ -822,9 +822,10 @@ static void
cifs_echo_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->callback_data;
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
DeleteMidQEntry(mid);
- add_credits(server, 1, CIFS_ECHO_OP);
+ add_credits(server, &credits, CIFS_ECHO_OP);
}
int
@@ -859,7 +860,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
iov[1].iov_base = (char *)smb + 4;
rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback, NULL,
- server, CIFS_ASYNC_OP | CIFS_ECHO_OP);
+ server, CIFS_ASYNC_OP | CIFS_ECHO_OP, NULL);
if (rc)
cifs_dbg(FYI, "Echo request failed: %d\n", rc);
@@ -1605,16 +1606,17 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
}
if (server->ops->is_status_pending &&
- server->ops->is_status_pending(buf, server, 0)) {
+ server->ops->is_status_pending(buf, server)) {
cifs_discard_remaining_data(server);
return -1;
}
/* set up first two iov for signature check and to get credits */
rdata->iov[0].iov_base = buf;
- rdata->iov[0].iov_len = 4;
- rdata->iov[1].iov_base = buf + 4;
- rdata->iov[1].iov_len = server->total_read - 4;
+ rdata->iov[0].iov_len = server->vals->header_preamble_size;
+ rdata->iov[1].iov_base = buf + server->vals->header_preamble_size;
+ rdata->iov[1].iov_len =
+ server->total_read - server->vals->header_preamble_size;
cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
rdata->iov[0].iov_base, rdata->iov[0].iov_len);
cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
@@ -1713,6 +1715,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
.rq_npages = rdata->nr_pages,
.rq_pagesz = rdata->pagesz,
.rq_tailsz = rdata->tailsz };
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
__func__, mid->mid, mid->mid_state, rdata->result,
@@ -1750,7 +1753,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &rdata->work);
DeleteMidQEntry(mid);
- add_credits(server, 1, 0);
+ add_credits(server, &credits, 0);
}
/* cifs_async_readv - send an async write, and set up mid to handle result */
@@ -1809,7 +1812,7 @@ cifs_async_readv(struct cifs_readdata *rdata)
kref_get(&rdata->refcount);
rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive,
- cifs_readv_callback, NULL, rdata, 0);
+ cifs_readv_callback, NULL, rdata, 0, NULL);
if (rc == 0)
cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
@@ -2123,14 +2126,18 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
wdata2->tailsz = tailsz;
wdata2->bytes = cur_len;
- wdata2->cfile = find_writable_file(CIFS_I(inode), false);
+ rc = cifs_get_writable_file(CIFS_I(inode), false,
+ &wdata2->cfile);
if (!wdata2->cfile) {
- cifs_dbg(VFS, "No writable handles for inode\n");
- rc = -EBADF;
- break;
+ cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n",
+ rc);
+ if (!is_retryable_error(rc))
+ rc = -EBADF;
+ } else {
+ wdata2->pid = wdata2->cfile->pid;
+ rc = server->ops->async_writev(wdata2,
+ cifs_writedata_release);
}
- wdata2->pid = wdata2->cfile->pid;
- rc = server->ops->async_writev(wdata2, cifs_writedata_release);
for (j = 0; j < nr_pages; j++) {
unlock_page(wdata2->pages[j]);
@@ -2145,6 +2152,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
kref_put(&wdata2->refcount, cifs_writedata_release);
if (is_retryable_error(rc))
continue;
+ i += nr_pages;
break;
}
@@ -2152,6 +2160,13 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
i += nr_pages;
} while (i < wdata->nr_pages);
+ /* cleanup remaining pages from the original wdata */
+ for (; i < wdata->nr_pages; i++) {
+ SetPageError(wdata->pages[i]);
+ end_page_writeback(wdata->pages[i]);
+ put_page(wdata->pages[i]);
+ }
+
if (rc != 0 && !is_retryable_error(rc))
mapping_set_error(inode->i_mapping, rc);
kref_put(&wdata->refcount, cifs_writedata_release);
@@ -2226,6 +2241,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
unsigned int written;
WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
@@ -2261,7 +2277,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &wdata->work);
DeleteMidQEntry(mid);
- add_credits(tcon->ses->server, 1, 0);
+ add_credits(tcon->ses->server, &credits, 0);
}
/* cifs_async_writev - send an async write, and set up mid to handle result */
@@ -2339,7 +2355,7 @@ cifs_async_writev(struct cifs_writedata *wdata,
kref_get(&wdata->refcount);
rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
- cifs_writev_callback, NULL, wdata, 0);
+ cifs_writev_callback, NULL, wdata, 0, NULL);
if (rc == 0)
cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 8463c940e0e5..4c0e44489f21 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -102,8 +102,8 @@ enum {
Opt_backupuid, Opt_backupgid, Opt_uid,
Opt_cruid, Opt_gid, Opt_file_mode,
Opt_dirmode, Opt_port,
- Opt_rsize, Opt_wsize, Opt_actimeo,
- Opt_echo_interval, Opt_max_credits,
+ Opt_blocksize, Opt_rsize, Opt_wsize, Opt_actimeo,
+ Opt_echo_interval, Opt_max_credits, Opt_handletimeout,
Opt_snapshot,
/* Mount options which take string value */
@@ -204,9 +204,11 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_dirmode, "dirmode=%s" },
{ Opt_dirmode, "dir_mode=%s" },
{ Opt_port, "port=%s" },
+ { Opt_blocksize, "bsize=%s" },
{ Opt_rsize, "rsize=%s" },
{ Opt_wsize, "wsize=%s" },
{ Opt_actimeo, "actimeo=%s" },
+ { Opt_handletimeout, "handletimeout=%s" },
{ Opt_echo_interval, "echo_interval=%s" },
{ Opt_max_credits, "max_credits=%s" },
{ Opt_snapshot, "snapshot=%s" },
@@ -348,7 +350,7 @@ static int reconn_set_ipaddr(struct TCP_Server_Info *server)
cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__);
return -ENOMEM;
}
- snprintf(unc, len, "\\\\%s", server->hostname);
+ scnprintf(unc, len, "\\\\%s", server->hostname);
rc = dns_resolve_server_name_to_ip(unc, &ipaddr);
kfree(unc);
@@ -592,6 +594,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
msleep(3000);
} else {
atomic_inc(&tcpSesReconnectCount);
+ set_credits(server, 1);
spin_lock(&GlobalMid_Lock);
if (server->tcpStatus != CifsExiting)
server->tcpStatus = CifsNeedNegotiate;
@@ -1053,7 +1056,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
}
if (server->ops->is_status_pending &&
- server->ops->is_status_pending(buf, server, length))
+ server->ops->is_status_pending(buf, server))
return -1;
if (!mid)
@@ -1063,6 +1066,26 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return 0;
}
+static void
+smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
+{
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+
+ /*
+ * SMB1 does not use credits.
+ */
+ if (server->vals->header_preamble_size)
+ return;
+
+ if (shdr->CreditRequest) {
+ spin_lock(&server->req_lock);
+ server->credits += le16_to_cpu(shdr->CreditRequest);
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ }
+}
+
+
static int
cifs_demultiplex_thread(void *p)
{
@@ -1169,10 +1192,6 @@ next_pdu:
continue;
}
- if (server->large_buf)
- buf = server->bigbuf;
-
-
server->lstrp = jiffies;
for (i = 0; i < num_mids; i++) {
@@ -1192,6 +1211,7 @@ next_pdu:
} else if (server->ops->is_oplock_break &&
server->ops->is_oplock_break(bufs[i],
server)) {
+ smb2_add_credits_from_hdr(bufs[i], server);
cifs_dbg(FYI, "Received oplock break\n");
} else {
cifs_dbg(VFS, "No task to wake, unknown frame "
@@ -1203,6 +1223,7 @@ next_pdu:
if (server->ops->dump_detail)
server->ops->dump_detail(bufs[i],
server);
+ smb2_add_credits_from_hdr(bufs[i], server);
cifs_dump_mids(server);
#endif /* CIFS_DEBUG2 */
}
@@ -1486,6 +1507,11 @@ cifs_parse_devname(const char *devname, struct smb_vol *vol)
const char *delims = "/\\";
size_t len;
+ if (unlikely(!devname || !*devname)) {
+ cifs_dbg(VFS, "Device name not specified.\n");
+ return -EINVAL;
+ }
+
/* make sure we have a valid UNC double delimiter prefix */
len = strspn(devname, delims);
if (len != 2)
@@ -1571,7 +1597,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->cred_uid = current_uid();
vol->linux_uid = current_uid();
vol->linux_gid = current_gid();
-
+ vol->bsize = 1024 * 1024; /* can improve cp performance significantly */
/*
* default to SFM style remapping of seven reserved characters
* unless user overrides it or we negotiate CIFS POSIX where
@@ -1594,6 +1620,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->actimeo = CIFS_DEF_ACTIMEO;
+ /* Most clients set timeout to 0, allows server to use its default */
+ vol->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */
+
/* offer SMB2.1 and later (SMB3 etc). Secure and widely accepted */
vol->ops = &smb30_operations;
vol->vals = &smbdefault_values;
@@ -1944,6 +1973,26 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
port = (unsigned short)option;
break;
+ case Opt_blocksize:
+ if (get_option_ul(args, &option)) {
+ cifs_dbg(VFS, "%s: Invalid blocksize value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ /*
+ * inode blocksize realistically should never need to be
+ * less than 16K or greater than 16M and default is 1MB.
+ * Note that small inode block sizes (e.g. 64K) can lead
+ * to very poor performance of common tools like cp and scp
+ */
+ if ((option < CIFS_MAX_MSGSIZE) ||
+ (option > (4 * SMB3_DEFAULT_IOSIZE))) {
+ cifs_dbg(VFS, "%s: Invalid blocksize\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->bsize = option;
+ break;
case Opt_rsize:
if (get_option_ul(args, &option)) {
cifs_dbg(VFS, "%s: Invalid rsize value\n",
@@ -1972,6 +2021,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
goto cifs_parse_mount_err;
}
break;
+ case Opt_handletimeout:
+ if (get_option_ul(args, &option)) {
+ cifs_dbg(VFS, "%s: Invalid handletimeout value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->handle_timeout = option;
+ if (vol->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) {
+ cifs_dbg(VFS, "Invalid handle cache timeout, longer than 16 minutes\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
case Opt_echo_interval:
if (get_option_ul(args, &option)) {
cifs_dbg(VFS, "%s: Invalid echo interval value\n",
@@ -2609,7 +2670,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
tcp_ses->session_estab = false;
tcp_ses->sequence_number = 0;
- tcp_ses->reconnect_instance = 0;
+ tcp_ses->reconnect_instance = 1;
tcp_ses->lstrp = jiffies;
spin_lock_init(&tcp_ses->req_lock);
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
@@ -2770,7 +2831,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb_vol *volume_info)
if (tcon == NULL)
return -ENOMEM;
- snprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname);
+ scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname);
/* cannot fail */
nls_codepage = load_nls_default();
@@ -3138,6 +3199,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb_vol *volume_info)
return 0;
if (tcon->snapshot_time != volume_info->snapshot_time)
return 0;
+ if (tcon->handle_timeout != volume_info->handle_timeout)
+ return 0;
return 1;
}
@@ -3252,6 +3315,16 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
tcon->snapshot_time = volume_info->snapshot_time;
}
+ if (volume_info->handle_timeout) {
+ if (ses->server->vals->protocol_id == 0) {
+ cifs_dbg(VFS,
+ "Use SMB2.1 or later for handle timeout option\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+ } else
+ tcon->handle_timeout = volume_info->handle_timeout;
+ }
+
tcon->ses = ses;
if (volume_info->password) {
tcon->password = kstrdup(volume_info->password, GFP_KERNEL);
@@ -3839,6 +3912,7 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
spin_lock_init(&cifs_sb->tlink_tree_lock);
cifs_sb->tlink_tree = RB_ROOT;
+ cifs_sb->bsize = pvolume_info->bsize;
/*
* Temporarily set r/wsize for matching superblock. If we end up using
* new sb then client will later negotiate it downward if needed.
@@ -4198,7 +4272,7 @@ static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it,
new_unc = kmalloc(len, GFP_KERNEL);
if (!new_unc)
return -ENOMEM;
- snprintf(new_unc, len, "\\%s", tgt);
+ scnprintf(new_unc, len, "\\%s", tgt);
kfree(vol->UNC);
vol->UNC = new_unc;
@@ -4902,8 +4976,6 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses)
if (!server->ops->need_neg(server))
return 0;
- set_credits(server, 1);
-
rc = server->ops->negotiate(xid, ses);
if (rc == 0) {
spin_lock(&GlobalMid_Lock);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 907e85d65bb4..f26a48dd2e39 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -621,20 +621,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
{
int rc = -EPERM;
unsigned int xid;
- int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
struct cifs_tcon *tcon;
- struct cifs_io_parms io_parms;
char *full_path = NULL;
- struct inode *newinode = NULL;
- __u32 oplock = 0;
- struct cifs_fid fid;
- struct cifs_open_parms oparms;
- FILE_ALL_INFO *buf = NULL;
- unsigned int bytes_written;
- struct win_dev *pdev;
- struct kvec iov[2];
if (!old_valid_dev(device_number))
return -EINVAL;
@@ -654,103 +644,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
goto mknod_out;
}
- if (tcon->unix_ext) {
- struct cifs_unix_set_info_args args = {
- .mode = mode & ~current_umask(),
- .ctime = NO_CHANGE_64,
- .atime = NO_CHANGE_64,
- .mtime = NO_CHANGE_64,
- .device = device_number,
- };
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- args.uid = current_fsuid();
- args.gid = current_fsgid();
- } else {
- args.uid = INVALID_UID; /* no change */
- args.gid = INVALID_GID; /* no change */
- }
- rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- if (rc)
- goto mknod_out;
-
- rc = cifs_get_inode_info_unix(&newinode, full_path,
- inode->i_sb, xid);
-
- if (rc == 0)
- d_instantiate(direntry, newinode);
- goto mknod_out;
- }
-
- if (!S_ISCHR(mode) && !S_ISBLK(mode))
- goto mknod_out;
-
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- goto mknod_out;
-
-
- cifs_dbg(FYI, "sfu compat create special file\n");
-
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (buf == NULL) {
- rc = -ENOMEM;
- goto mknod_out;
- }
-
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
-
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = GENERIC_WRITE;
- oparms.create_options = create_options;
- oparms.disposition = FILE_CREATE;
- oparms.path = full_path;
- oparms.fid = &fid;
- oparms.reconnect = false;
-
- if (tcon->ses->server->oplocks)
- oplock = REQ_OPLOCK;
- else
- oplock = 0;
- rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
- if (rc)
- goto mknod_out;
-
- /*
- * BB Do not bother to decode buf since no local inode yet to put
- * timestamps in, but we can reuse it safely.
- */
-
- pdev = (struct win_dev *)buf;
- io_parms.pid = current->tgid;
- io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = sizeof(struct win_dev);
- iov[1].iov_base = buf;
- iov[1].iov_len = sizeof(struct win_dev);
- if (S_ISCHR(mode)) {
- memcpy(pdev->type, "IntxCHR", 8);
- pdev->major = cpu_to_le64(MAJOR(device_number));
- pdev->minor = cpu_to_le64(MINOR(device_number));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
- } else if (S_ISBLK(mode)) {
- memcpy(pdev->type, "IntxBLK", 8);
- pdev->major = cpu_to_le64(MAJOR(device_number));
- pdev->minor = cpu_to_le64(MINOR(device_number));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
- }
- tcon->ses->server->ops->close(xid, tcon, &fid);
- d_drop(direntry);
-
- /* FIXME: add code here to set EAs */
+ rc = tcon->ses->server->ops->make_node(xid, inode, direntry, tcon,
+ full_path, mode,
+ device_number);
mknod_out:
kfree(full_path);
- kfree(buf);
free_xid(xid);
cifs_put_tlink(tlink);
return rc;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 659ce1b92c44..7037a137fa53 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -360,13 +360,31 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file)
return cifs_file;
}
-/*
- * Release a reference on the file private data. This may involve closing
- * the filehandle out on the server. Must be called without holding
- * tcon->open_file_lock and cifs_file->file_info_lock.
+/**
+ * cifsFileInfo_put - release a reference of file priv data
+ *
+ * Always potentially wait for oplock handler. See _cifsFileInfo_put().
*/
void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
{
+ _cifsFileInfo_put(cifs_file, true);
+}
+
+/**
+ * _cifsFileInfo_put - release a reference of file priv data
+ *
+ * This may involve closing the filehandle @cifs_file out on the
+ * server. Must be called without holding tcon->open_file_lock and
+ * cifs_file->file_info_lock.
+ *
+ * If @wait_for_oplock_handler is true and we are releasing the last
+ * reference, wait for any running oplock break handler of the file
+ * and cancel any pending one. If calling this function from the
+ * oplock break handler, you need to pass false.
+ *
+ */
+void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
+{
struct inode *inode = d_inode(cifs_file->dentry);
struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
@@ -414,7 +432,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
spin_unlock(&tcon->open_file_lock);
- oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break);
+ oplock_break_cancelled = wait_oplock_handler ?
+ cancel_work_sync(&cifs_file->oplock_break) : false;
if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
struct TCP_Server_Info *server = tcon->ses->server;
@@ -1645,8 +1664,20 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
rc = server->ops->mand_unlock_range(cfile, flock, xid);
out:
- if (flock->fl_flags & FL_POSIX && !rc)
+ if (flock->fl_flags & FL_POSIX) {
+ /*
+ * If this is a request to remove all locks because we
+ * are closing the file, it doesn't matter if the
+ * unlocking failed as both cifs.ko and the SMB server
+ * remove the lock on file close
+ */
+ if (rc) {
+ cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc);
+ if (!(flock->fl_flags & FL_CLOSE))
+ return rc;
+ }
rc = locks_lock_file_wait(file, flock);
+ }
return rc;
}
@@ -1842,24 +1873,30 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
return NULL;
}
-struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
- bool fsuid_only)
+/* Return -EBADF if no handle is found and general rc otherwise */
+int
+cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
+ struct cifsFileInfo **ret_file)
{
struct cifsFileInfo *open_file, *inv_file = NULL;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
bool any_available = false;
- int rc;
+ int rc = -EBADF;
unsigned int refind = 0;
- /* Having a null inode here (because mapping->host was set to zero by
- the VFS or MM) should not happen but we had reports of on oops (due to
- it being zero) during stress testcases so we need to check for it */
+ *ret_file = NULL;
+
+ /*
+ * Having a null inode here (because mapping->host was set to zero by
+ * the VFS or MM) should not happen but we had reports of on oops (due
+ * to it being zero) during stress testcases so we need to check for it
+ */
if (cifs_inode == NULL) {
cifs_dbg(VFS, "Null inode passed to cifs_writeable_file\n");
dump_stack();
- return NULL;
+ return rc;
}
cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
@@ -1873,7 +1910,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
refind_writable:
if (refind > MAX_REOPEN_ATT) {
spin_unlock(&tcon->open_file_lock);
- return NULL;
+ return rc;
}
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
if (!any_available && open_file->pid != current->tgid)
@@ -1885,7 +1922,8 @@ refind_writable:
/* found a good writable file */
cifsFileInfo_get(open_file);
spin_unlock(&tcon->open_file_lock);
- return open_file;
+ *ret_file = open_file;
+ return 0;
} else {
if (!inv_file)
inv_file = open_file;
@@ -1907,22 +1945,35 @@ refind_writable:
if (inv_file) {
rc = cifs_reopen_file(inv_file, false);
- if (!rc)
- return inv_file;
- else {
- spin_lock(&tcon->open_file_lock);
- list_move_tail(&inv_file->flist,
- &cifs_inode->openFileList);
- spin_unlock(&tcon->open_file_lock);
- cifsFileInfo_put(inv_file);
- ++refind;
- inv_file = NULL;
- spin_lock(&tcon->open_file_lock);
- goto refind_writable;
+ if (!rc) {
+ *ret_file = inv_file;
+ return 0;
}
+
+ spin_lock(&tcon->open_file_lock);
+ list_move_tail(&inv_file->flist, &cifs_inode->openFileList);
+ spin_unlock(&tcon->open_file_lock);
+ cifsFileInfo_put(inv_file);
+ ++refind;
+ inv_file = NULL;
+ spin_lock(&tcon->open_file_lock);
+ goto refind_writable;
}
- return NULL;
+ return rc;
+}
+
+struct cifsFileInfo *
+find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only)
+{
+ struct cifsFileInfo *cfile;
+ int rc;
+
+ rc = cifs_get_writable_file(cifs_inode, fsuid_only, &cfile);
+ if (rc)
+ cifs_dbg(FYI, "couldn't find writable handle rc=%d", rc);
+
+ return cfile;
}
static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
@@ -1959,8 +2010,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
if (mapping->host->i_size - offset < (loff_t)to)
to = (unsigned)(mapping->host->i_size - offset);
- open_file = find_writable_file(CIFS_I(mapping->host), false);
- if (open_file) {
+ rc = cifs_get_writable_file(CIFS_I(mapping->host), false, &open_file);
+ if (!rc) {
bytes_written = cifs_write(open_file, open_file->pid,
write_data, to - from, &offset);
cifsFileInfo_put(open_file);
@@ -1970,9 +2021,12 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
rc = 0;
else if (bytes_written < 0)
rc = bytes_written;
+ else
+ rc = -EFAULT;
} else {
- cifs_dbg(FYI, "No writeable filehandles for inode\n");
- rc = -EIO;
+ cifs_dbg(FYI, "No writable handle for write page rc=%d\n", rc);
+ if (!is_retryable_error(rc))
+ rc = -EIO;
}
kunmap(page);
@@ -2079,9 +2133,9 @@ static int
wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
struct address_space *mapping, struct writeback_control *wbc)
{
- int rc = 0;
- struct TCP_Server_Info *server;
- unsigned int i;
+ int rc;
+ struct TCP_Server_Info *server =
+ tlink_tcon(wdata->cfile->tlink)->ses->server;
wdata->sync_mode = wbc->sync_mode;
wdata->nr_pages = nr_pages;
@@ -2091,21 +2145,16 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
page_offset(wdata->pages[nr_pages - 1]),
(loff_t)PAGE_SIZE);
wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz;
+ wdata->pid = wdata->cfile->pid;
- if (wdata->cfile != NULL)
- cifsFileInfo_put(wdata->cfile);
- wdata->cfile = find_writable_file(CIFS_I(mapping->host), false);
- if (!wdata->cfile) {
- cifs_dbg(VFS, "No writable handles for inode\n");
- rc = -EBADF;
- } else {
- wdata->pid = wdata->cfile->pid;
- server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- rc = server->ops->async_writev(wdata, cifs_writedata_release);
- }
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+ if (rc)
+ return rc;
- for (i = 0; i < nr_pages; ++i)
- unlock_page(wdata->pages[i]);
+ if (wdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_writev(wdata, cifs_writedata_release);
return rc;
}
@@ -2113,11 +2162,13 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
static int cifs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
+ struct inode *inode = mapping->host;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct TCP_Server_Info *server;
bool done = false, scanned = false, range_whole = false;
pgoff_t end, index;
struct cifs_writedata *wdata;
+ struct cifsFileInfo *cfile = NULL;
int rc = 0;
int saved_rc = 0;
unsigned int xid;
@@ -2143,11 +2194,23 @@ static int cifs_writepages(struct address_space *mapping,
server = cifs_sb_master_tcon(cifs_sb)->ses->server;
retry:
while (!done && index <= end) {
- unsigned int i, nr_pages, found_pages, wsize, credits;
+ unsigned int i, nr_pages, found_pages, wsize;
pgoff_t next = 0, tofind, saved_index = index;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
+ int get_file_rc = 0;
+
+ if (cfile)
+ cifsFileInfo_put(cfile);
+
+ rc = cifs_get_writable_file(CIFS_I(inode), false, &cfile);
+
+ /* in case of an error store it to return later */
+ if (rc)
+ get_file_rc = rc;
rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
- &wsize, &credits);
+ &wsize, credits);
if (rc != 0) {
done = true;
break;
@@ -2180,13 +2243,26 @@ retry:
continue;
}
- wdata->credits = credits;
+ wdata->credits = credits_on_stack;
+ wdata->cfile = cfile;
+ cfile = NULL;
+
+ if (!wdata->cfile) {
+ cifs_dbg(VFS, "No writable handle in writepages rc=%d\n",
+ get_file_rc);
+ if (is_retryable_error(get_file_rc))
+ rc = get_file_rc;
+ else
+ rc = -EBADF;
+ } else
+ rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
- rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
+ for (i = 0; i < nr_pages; ++i)
+ unlock_page(wdata->pages[i]);
/* send failure -- clean up the mess */
if (rc != 0) {
- add_credits_and_wake_if(server, wdata->credits, 0);
+ add_credits_and_wake_if(server, &wdata->credits, 0);
for (i = 0; i < nr_pages; ++i) {
if (is_retryable_error(rc))
redirty_page_for_writepage(wbc,
@@ -2238,6 +2314,8 @@ retry:
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
+ if (cfile)
+ cifsFileInfo_put(cfile);
free_xid(xid);
return rc;
}
@@ -2567,47 +2645,62 @@ static int
cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
struct cifs_aio_ctx *ctx)
{
- unsigned int wsize, credits;
+ unsigned int wsize;
+ struct cifs_credits credits;
int rc;
struct TCP_Server_Info *server =
tlink_tcon(wdata->cfile->tlink)->ses->server;
- /*
- * Wait for credits to resend this wdata.
- * Note: we are attempting to resend the whole wdata not in segments
- */
do {
- rc = server->ops->wait_mtu_credits(
- server, wdata->bytes, &wsize, &credits);
+ if (wdata->cfile->invalidHandle) {
+ rc = cifs_reopen_file(wdata->cfile, false);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
- if (rc)
- goto out;
- if (wsize < wdata->bytes) {
- add_credits_and_wake_if(server, credits, 0);
- msleep(1000);
- }
- } while (wsize < wdata->bytes);
+ /*
+ * Wait for credits to resend this wdata.
+ * Note: we are attempting to resend the whole wdata not in
+ * segments
+ */
+ do {
+ rc = server->ops->wait_mtu_credits(server, wdata->bytes,
+ &wsize, &credits);
+ if (rc)
+ goto fail;
+
+ if (wsize < wdata->bytes) {
+ add_credits_and_wake_if(server, &credits, 0);
+ msleep(1000);
+ }
+ } while (wsize < wdata->bytes);
+ wdata->credits = credits;
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
- rc = 0;
- if (wdata->cfile->invalidHandle)
- rc = cifs_reopen_file(wdata->cfile, false);
- if (!rc)
- rc = server->ops->async_writev(wdata,
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+
+ if (!rc) {
+ if (wdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_writev(wdata,
cifs_uncached_writedata_release);
- }
+ }
- if (!rc) {
- list_add_tail(&wdata->list, wdata_list);
- return 0;
- }
+ /* If the write was successfully sent, we are done */
+ if (!rc) {
+ list_add_tail(&wdata->list, wdata_list);
+ return 0;
+ }
- add_credits_and_wake_if(server, wdata->credits, 0);
-out:
- kref_put(&wdata->refcount, cifs_uncached_writedata_release);
+ /* Roll back credits and retry if needed */
+ add_credits_and_wake_if(server, &wdata->credits, 0);
+ } while (rc == -EAGAIN);
+fail:
+ kref_put(&wdata->refcount, cifs_uncached_writedata_release);
return rc;
}
@@ -2627,6 +2720,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
struct TCP_Server_Info *server;
struct page **pagevec;
size_t start;
+ unsigned int xid;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
@@ -2634,12 +2728,23 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
pid = current->tgid;
server = tlink_tcon(open_file->tlink)->ses->server;
+ xid = get_xid();
do {
- unsigned int wsize, credits;
+ unsigned int wsize;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
+
+ if (open_file->invalidHandle) {
+ rc = cifs_reopen_file(open_file, false);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
- &wsize, &credits);
+ &wsize, credits);
if (rc)
break;
@@ -2731,16 +2836,22 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
wdata->pid = pid;
wdata->bytes = cur_len;
wdata->pagesz = PAGE_SIZE;
- wdata->credits = credits;
+ wdata->credits = credits_on_stack;
wdata->ctx = ctx;
kref_get(&ctx->refcount);
- if (!wdata->cfile->invalidHandle ||
- !(rc = cifs_reopen_file(wdata->cfile, false)))
- rc = server->ops->async_writev(wdata,
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+
+ if (!rc) {
+ if (wdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_writev(wdata,
cifs_uncached_writedata_release);
+ }
+
if (rc) {
- add_credits_and_wake_if(server, wdata->credits, 0);
+ add_credits_and_wake_if(server, &wdata->credits, 0);
kref_put(&wdata->refcount,
cifs_uncached_writedata_release);
if (rc == -EAGAIN) {
@@ -2756,6 +2867,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
len -= cur_len;
} while (len > 0);
+ free_xid(xid);
return rc;
}
@@ -2765,7 +2877,6 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
struct dentry *dentry = ctx->cfile->dentry;
- unsigned int i;
int rc;
tcon = tlink_tcon(ctx->cfile->tlink);
@@ -2816,12 +2927,12 @@ restart_loop:
wdata->bytes, &tmp_from,
ctx->cfile, cifs_sb, &tmp_list,
ctx);
+
+ kref_put(&wdata->refcount,
+ cifs_uncached_writedata_release);
}
list_splice(&tmp_list, &ctx->list);
-
- kref_put(&wdata->refcount,
- cifs_uncached_writedata_release);
goto restart_loop;
}
}
@@ -2829,10 +2940,6 @@ restart_loop:
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
- if (!ctx->direct_io)
- for (i = 0; i < ctx->npages; i++)
- put_page(ctx->bv[i].bv_page);
-
cifs_stats_bytes_written(tcon, ctx->total_len);
set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags);
@@ -3028,14 +3135,16 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
* these pages but not on the region from pos to ppos+len-1.
*/
written = cifs_user_writev(iocb, from);
- if (written > 0 && CIFS_CACHE_READ(cinode)) {
+ if (CIFS_CACHE_READ(cinode)) {
/*
- * Windows 7 server can delay breaking level2 oplock if a write
- * request comes - break it on the client to prevent reading
- * an old data.
+ * We have read level caching and we have just sent a write
+ * request to the server thus making data in the cache stale.
+ * Zap the cache and set oplock/lease level to NONE to avoid
+ * reading stale data from the cache. All subsequent read
+ * operations will read new data from the server.
*/
cifs_zap_mapping(inode);
- cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
+ cifs_dbg(FYI, "Set Oplock/Lease to NONE for inode=%p after write\n",
inode);
cinode->oplock = 0;
}
@@ -3260,48 +3369,61 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata,
struct list_head *rdata_list,
struct cifs_aio_ctx *ctx)
{
- unsigned int rsize, credits;
+ unsigned int rsize;
+ struct cifs_credits credits;
int rc;
struct TCP_Server_Info *server =
tlink_tcon(rdata->cfile->tlink)->ses->server;
- /*
- * Wait for credits to resend this rdata.
- * Note: we are attempting to resend the whole rdata not in segments
- */
do {
- rc = server->ops->wait_mtu_credits(server, rdata->bytes,
+ if (rdata->cfile->invalidHandle) {
+ rc = cifs_reopen_file(rdata->cfile, true);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
+
+ /*
+ * Wait for credits to resend this rdata.
+ * Note: we are attempting to resend the whole rdata not in
+ * segments
+ */
+ do {
+ rc = server->ops->wait_mtu_credits(server, rdata->bytes,
&rsize, &credits);
- if (rc)
- goto out;
+ if (rc)
+ goto fail;
- if (rsize < rdata->bytes) {
- add_credits_and_wake_if(server, credits, 0);
- msleep(1000);
- }
- } while (rsize < rdata->bytes);
+ if (rsize < rdata->bytes) {
+ add_credits_and_wake_if(server, &credits, 0);
+ msleep(1000);
+ }
+ } while (rsize < rdata->bytes);
+ rdata->credits = credits;
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
- rc = 0;
- if (rdata->cfile->invalidHandle)
- rc = cifs_reopen_file(rdata->cfile, true);
- if (!rc)
- rc = server->ops->async_readv(rdata);
- }
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+ if (!rc) {
+ if (rdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_readv(rdata);
+ }
- if (!rc) {
- /* Add to aio pending list */
- list_add_tail(&rdata->list, rdata_list);
- return 0;
- }
+ /* If the read was successfully sent, we are done */
+ if (!rc) {
+ /* Add to aio pending list */
+ list_add_tail(&rdata->list, rdata_list);
+ return 0;
+ }
- add_credits_and_wake_if(server, rdata->credits, 0);
-out:
- kref_put(&rdata->refcount,
- cifs_uncached_readdata_release);
+ /* Roll back credits and retry if needed */
+ add_credits_and_wake_if(server, &rdata->credits, 0);
+ } while (rc == -EAGAIN);
+fail:
+ kref_put(&rdata->refcount, cifs_uncached_readdata_release);
return rc;
}
@@ -3311,7 +3433,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
struct cifs_aio_ctx *ctx)
{
struct cifs_readdata *rdata;
- unsigned int npages, rsize, credits;
+ unsigned int npages, rsize;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
size_t cur_len;
int rc;
pid_t pid;
@@ -3331,8 +3455,16 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
iov_iter_advance(&direct_iov, offset - ctx->pos);
do {
+ if (open_file->invalidHandle) {
+ rc = cifs_reopen_file(open_file, true);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
+
rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
- &rsize, &credits);
+ &rsize, credits);
if (rc)
break;
@@ -3406,15 +3538,21 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
rdata->pagesz = PAGE_SIZE;
rdata->read_into_pages = cifs_uncached_read_into_pages;
rdata->copy_into_pages = cifs_uncached_copy_into_pages;
- rdata->credits = credits;
+ rdata->credits = credits_on_stack;
rdata->ctx = ctx;
kref_get(&ctx->refcount);
- if (!rdata->cfile->invalidHandle ||
- !(rc = cifs_reopen_file(rdata->cfile, true)))
- rc = server->ops->async_readv(rdata);
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+
+ if (!rc) {
+ if (rdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_readv(rdata);
+ }
+
if (rc) {
- add_credits_and_wake_if(server, rdata->credits, 0);
+ add_credits_and_wake_if(server, &rdata->credits, 0);
kref_put(&rdata->refcount,
cifs_uncached_readdata_release);
if (rc == -EAGAIN) {
@@ -3439,7 +3577,6 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx)
struct iov_iter *to = &ctx->iter;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
- unsigned int i;
int rc;
tcon = tlink_tcon(ctx->cfile->tlink);
@@ -3523,17 +3660,8 @@ again:
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
- if (!ctx->direct_io) {
- for (i = 0; i < ctx->npages; i++) {
- if (ctx->should_dirty)
- set_page_dirty(ctx->bv[i].bv_page);
- put_page(ctx->bv[i].bv_page);
- }
-
+ if (!ctx->direct_io)
ctx->total_len = ctx->len - iov_iter_count(to);
- }
-
- cifs_stats_bytes_read(tcon, ctx->total_len);
/* mask nodata case */
if (rc == -ENODATA)
@@ -4095,10 +4223,19 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
loff_t offset;
struct page *page, *tpage;
struct cifs_readdata *rdata;
- unsigned credits;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
+
+ if (open_file->invalidHandle) {
+ rc = cifs_reopen_file(open_file, true);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
- &rsize, &credits);
+ &rsize, credits);
if (rc)
break;
@@ -4144,18 +4281,24 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
rdata->tailsz = PAGE_SIZE;
rdata->read_into_pages = cifs_readpages_read_into_pages;
rdata->copy_into_pages = cifs_readpages_copy_into_pages;
- rdata->credits = credits;
+ rdata->credits = credits_on_stack;
list_for_each_entry_safe(page, tpage, &tmplist, lru) {
list_del(&page->lru);
rdata->pages[rdata->nr_pages++] = page;
}
- if (!rdata->cfile->invalidHandle ||
- !(rc = cifs_reopen_file(rdata->cfile, true)))
- rc = server->ops->async_readv(rdata);
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+
+ if (!rc) {
+ if (rdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_readv(rdata);
+ }
+
if (rc) {
- add_credits_and_wake_if(server, rdata->credits, 0);
+ add_credits_and_wake_if(server, &rdata->credits, 0);
for (i = 0; i < rdata->nr_pages; i++) {
page = rdata->pages[i];
lru_cache_add_file(page);
@@ -4466,6 +4609,7 @@ void cifs_oplock_break(struct work_struct *work)
cinode);
cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
}
+ _cifsFileInfo_put(cfile, false /* do not wait for ourself */);
cifs_done_oplock_break(cinode);
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 478003644916..538fd7d807e4 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1735,6 +1735,10 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
if (rc == 0 || rc != -EBUSY)
goto do_rename_exit;
+ /* Don't fall back to using SMB on SMB 2+ mount */
+ if (server->vals->protocol_id != 0)
+ goto do_rename_exit;
+
/* open-file renames don't work across directories */
if (to_dentry->d_parent != from_dentry->d_parent)
goto do_rename_exit;
@@ -2080,7 +2084,7 @@ int cifs_getattr(const struct path *path, struct kstat *stat,
return rc;
generic_fillattr(inode, stat);
- stat->blksize = CIFS_MAX_MSGSIZE;
+ stat->blksize = cifs_sb->bsize;
stat->ino = CIFS_I(inode)->uniqueid;
/* old CIFS Unix Extensions doesn't return create time */
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 2148b0f60e5e..62216dc8f9f5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -103,9 +103,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
return rc;
}
- snprintf(md5_str2, sizeof(md5_str2),
- CIFS_MF_SYMLINK_MD5_FORMAT,
- CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+ scnprintf(md5_str2, sizeof(md5_str2),
+ CIFS_MF_SYMLINK_MD5_FORMAT,
+ CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
if (strncmp(md5_str1, md5_str2, 17) != 0)
return -EINVAL;
@@ -142,10 +142,10 @@ format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str)
return rc;
}
- snprintf(buf, buf_len,
- CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
- link_len,
- CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+ scnprintf(buf, buf_len,
+ CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
+ link_len,
+ CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
ofs = CIFS_MF_SYMLINK_LINK_OFFSET;
memcpy(buf + ofs, link_str, link_len);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index bee203055b30..b1a696a73f7c 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -501,8 +501,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
&pCifsInode->flags);
- queue_work(cifsoplockd_wq,
- &netfile->oplock_break);
+ cifs_queue_oplock_break(netfile);
netfile->oplock_break_cancelled = false;
spin_unlock(&tcon->open_file_lock);
@@ -607,6 +606,28 @@ void cifs_put_writer(struct cifsInodeInfo *cinode)
spin_unlock(&cinode->writers_lock);
}
+/**
+ * cifs_queue_oplock_break - queue the oplock break handler for cfile
+ *
+ * This function is called from the demultiplex thread when it
+ * receives an oplock break for @cfile.
+ *
+ * Assumes the tcon->open_file_lock is held.
+ * Assumes cfile->file_info_lock is NOT held.
+ */
+void cifs_queue_oplock_break(struct cifsFileInfo *cfile)
+{
+ /*
+ * Bump the handle refcount now while we hold the
+ * open_file_lock to enforce the validity of it for the oplock
+ * break handler. The matching put is done at the end of the
+ * handler.
+ */
+ cifsFileInfo_get(cfile);
+
+ queue_work(cifsoplockd_wq, &cfile->oplock_break);
+}
+
void cifs_done_oplock_break(struct cifsInodeInfo *cinode)
{
clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
@@ -768,6 +789,11 @@ cifs_aio_ctx_alloc(void)
{
struct cifs_aio_ctx *ctx;
+ /*
+ * Must use kzalloc to initialize ctx->bv to NULL and ctx->direct_io
+ * to false so that we know when we have to unreference pages within
+ * cifs_aio_ctx_release()
+ */
ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL);
if (!ctx)
return NULL;
@@ -786,7 +812,23 @@ cifs_aio_ctx_release(struct kref *refcount)
struct cifs_aio_ctx, refcount);
cifsFileInfo_put(ctx->cfile);
- kvfree(ctx->bv);
+
+ /*
+ * ctx->bv is only set if setup_aio_ctx_iter() was call successfuly
+ * which means that iov_iter_get_pages() was a success and thus that
+ * we have taken reference on pages.
+ */
+ if (ctx->bv) {
+ unsigned i;
+
+ for (i = 0; i < ctx->npages; i++) {
+ if (ctx->should_dirty)
+ set_page_dirty(ctx->bv[i].bv_page);
+ put_page(ctx->bv[i].bv_page);
+ }
+ kvfree(ctx->bv);
+ }
+
kfree(ctx);
}
@@ -917,7 +959,6 @@ cifs_alloc_hash(const char *name,
}
(*sdesc)->shash.tfm = *shash;
- (*sdesc)->shash.flags = 0x0;
return 0;
}
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 32a6c020478f..c711f1f39bf2 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -117,11 +117,11 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
}
static void
-cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add,
- const int optype)
+cifs_add_credits(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits, const int optype)
{
spin_lock(&server->req_lock);
- server->credits += add;
+ server->credits += credits->value;
server->in_flight--;
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
@@ -308,7 +308,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
remaining = tgt_total_cnt - total_in_tgt;
if (remaining < 0) {
- cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n",
+ cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%u\n",
tgt_total_cnt, total_in_tgt);
return -EPROTO;
}
@@ -1027,6 +1027,131 @@ cifs_can_echo(struct TCP_Server_Info *server)
return false;
}
+static int
+cifs_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ char *full_path, umode_t mode, dev_t dev)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct inode *newinode = NULL;
+ int rc = -EPERM;
+ int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
+ FILE_ALL_INFO *buf = NULL;
+ struct cifs_io_parms io_parms;
+ __u32 oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ unsigned int bytes_written;
+ struct win_dev *pdev;
+ struct kvec iov[2];
+
+ if (tcon->unix_ext) {
+ /*
+ * SMB1 Unix Extensions: requires server support but
+ * works with all special files
+ */
+ struct cifs_unix_set_info_args args = {
+ .mode = mode & ~current_umask(),
+ .ctime = NO_CHANGE_64,
+ .atime = NO_CHANGE_64,
+ .mtime = NO_CHANGE_64,
+ .device = dev,
+ };
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+ args.uid = current_fsuid();
+ args.gid = current_fsgid();
+ } else {
+ args.uid = INVALID_UID; /* no change */
+ args.gid = INVALID_GID; /* no change */
+ }
+ rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ if (rc)
+ goto out;
+
+ rc = cifs_get_inode_info_unix(&newinode, full_path,
+ inode->i_sb, xid);
+
+ if (rc == 0)
+ d_instantiate(dentry, newinode);
+ goto out;
+ }
+
+ /*
+ * SMB1 SFU emulation: should work with all servers, but only
+ * support block and char device (no socket & fifo)
+ */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+ goto out;
+
+ if (!S_ISCHR(mode) && !S_ISBLK(mode))
+ goto out;
+
+ cifs_dbg(FYI, "sfu compat create special file\n");
+
+ buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+ if (buf == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_WRITE;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_CREATE;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ if (tcon->ses->server->oplocks)
+ oplock = REQ_OPLOCK;
+ else
+ oplock = 0;
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+ if (rc)
+ goto out;
+
+ /*
+ * BB Do not bother to decode buf since no local inode yet to put
+ * timestamps in, but we can reuse it safely.
+ */
+
+ pdev = (struct win_dev *)buf;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = sizeof(struct win_dev);
+ iov[1].iov_base = buf;
+ iov[1].iov_len = sizeof(struct win_dev);
+ if (S_ISCHR(mode)) {
+ memcpy(pdev->type, "IntxCHR", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ } else if (S_ISBLK(mode)) {
+ memcpy(pdev->type, "IntxBLK", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ }
+ tcon->ses->server->ops->close(xid, tcon, &fid);
+ d_drop(dentry);
+
+ /* FIXME: add code here to set EAs */
+out:
+ kfree(buf);
+ return rc;
+}
+
+
+
struct smb_version_operations smb1_operations = {
.send_cancel = send_nt_cancel,
.compare_fids = cifs_compare_fids,
@@ -1110,6 +1235,7 @@ struct smb_version_operations smb1_operations = {
.get_acl_by_fid = get_cifs_acl_by_fid,
.set_acl = set_cifs_acl,
#endif /* CIFS_ACL */
+ .make_node = cifs_make_node,
};
struct smb_version_values smb1_values = {
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index b204e84b87fb..54bffb2a1786 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -68,13 +68,15 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
if (oparms->tcon->use_resilient) {
- nr_ioctl_req.Timeout = 0; /* use server default (120 seconds) */
+ /* default timeout is 0, servers pick default (120 seconds) */
+ nr_ioctl_req.Timeout =
+ cpu_to_le32(oparms->tcon->handle_timeout);
nr_ioctl_req.Reserved = 0;
rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid,
fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY,
true /* is_fsctl */,
(char *)&nr_ioctl_req, sizeof(nr_ioctl_req),
- NULL, NULL /* no return info */);
+ CIFSMaxBufSize, NULL, NULL /* no return info */);
if (rc == -EOPNOTSUPP) {
cifs_dbg(VFS,
"resiliency not supported by server, disabling\n");
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 01a76bccdb8d..278405d26c47 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -37,6 +37,16 @@
#include "smb2pdu.h"
#include "smb2proto.h"
+static void
+free_set_inf_compound(struct smb_rqst *rqst)
+{
+ if (rqst[1].rq_iov)
+ SMB2_set_info_free(&rqst[1]);
+ if (rqst[2].rq_iov)
+ SMB2_close_free(&rqst[2]);
+}
+
+
static int
smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
@@ -112,14 +122,18 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
PATH_MAX * 2, 0, NULL);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid,
+ full_path);
break;
case SMB2_OP_DELETE:
+ trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_MKDIR:
/*
* Directories are created through parameters in the
* SMB2_open() call.
*/
+ trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_RMDIR:
memset(&si_iov, 0, sizeof(si_iov));
@@ -135,6 +149,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_SET_EOF:
memset(&si_iov, 0, sizeof(si_iov));
@@ -150,6 +165,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_SET_INFO:
memset(&si_iov, 0, sizeof(si_iov));
@@ -166,6 +182,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid,
+ full_path);
break;
case SMB2_OP_RENAME:
memset(&si_iov, 0, sizeof(si_iov));
@@ -190,6 +208,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_HARDLINK:
memset(&si_iov, 0, sizeof(si_iov));
@@ -214,6 +233,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path);
break;
default:
cifs_dbg(VFS, "Invalid command\n");
@@ -252,21 +272,65 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_query_info_free(&rqst[1]);
if (rqst[2].rq_iov)
SMB2_close_free(&rqst[2]);
+ if (rc)
+ trace_smb3_query_info_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ else
+ trace_smb3_query_info_compound_done(xid, ses->Suid,
+ tcon->tid);
break;
case SMB2_OP_DELETE:
+ if (rc)
+ trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_delete_done(xid, ses->Suid, tcon->tid);
+ if (rqst[1].rq_iov)
+ SMB2_close_free(&rqst[1]);
+ break;
case SMB2_OP_MKDIR:
+ if (rc)
+ trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid);
if (rqst[1].rq_iov)
SMB2_close_free(&rqst[1]);
break;
case SMB2_OP_HARDLINK:
+ if (rc)
+ trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_RENAME:
+ if (rc)
+ trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_rename_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_RMDIR:
+ if (rc)
+ trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_SET_EOF:
+ if (rc)
+ trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_SET_INFO:
- if (rqst[1].rq_iov)
- SMB2_set_info_free(&rqst[1]);
- if (rqst[2].rq_iov)
- SMB2_close_free(&rqst[2]);
+ if (rc)
+ trace_smb3_set_info_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ else
+ trace_smb3_set_info_compound_done(xid, ses->Suid,
+ tcon->tid);
+ free_set_inf_compound(rqst);
break;
}
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
@@ -309,12 +373,17 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
rc = open_shroot(xid, tcon, &fid);
if (rc)
goto out;
- rc = SMB2_query_info(xid, tcon, fid.persistent_fid,
- fid.volatile_fid, smb2_data);
+
+ if (tcon->crfid.file_all_info_is_valid) {
+ move_smb2_info_to_cifs(data,
+ &tcon->crfid.file_all_info);
+ } else {
+ rc = SMB2_query_info(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, smb2_data);
+ if (!rc)
+ move_smb2_info_to_cifs(data, smb2_data);
+ }
close_shroot(&tcon->crfid);
- if (rc)
- goto out;
- move_smb2_info_to_cifs(data, smb2_data);
goto out;
}
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 924269cec135..e32c264e3adb 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -1036,7 +1036,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_UNFINISHED_CONTEXT_DELETED, -EIO,
"STATUS_UNFINISHED_CONTEXT_DELETED"},
{STATUS_NO_TGT_REPLY, -EIO, "STATUS_NO_TGT_REPLY"},
- {STATUS_OBJECTID_NOT_FOUND, -EIO, "STATUS_OBJECTID_NOT_FOUND"},
+ /* Note that ENOATTTR and ENODATA are the same errno */
+ {STATUS_OBJECTID_NOT_FOUND, -ENODATA, "STATUS_OBJECTID_NOT_FOUND"},
{STATUS_NO_IP_ADDRESSES, -EIO, "STATUS_NO_IP_ADDRESSES"},
{STATUS_WRONG_CREDENTIAL_HANDLE, -EIO,
"STATUS_WRONG_CREDENTIAL_HANDLE"},
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 7b8b58fb4d3f..e311f58dc1c8 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -517,7 +517,6 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
__u8 lease_state;
struct list_head *tmp;
struct cifsFileInfo *cfile;
- struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_pending_open *open;
struct cifsInodeInfo *cinode;
int ack_req = le32_to_cpu(rsp->Flags &
@@ -537,14 +536,26 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
le32_to_cpu(rsp->NewLeaseState));
- server->ops->set_oplock_level(cinode, lease_state, 0, NULL);
-
if (ack_req)
cfile->oplock_break_cancelled = false;
else
cfile->oplock_break_cancelled = true;
- queue_work(cifsoplockd_wq, &cfile->oplock_break);
+ set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
+
+ /*
+ * Set or clear flags depending on the lease state being READ.
+ * HANDLE caching flag should be added when the client starts
+ * to defer closing remote file handles with HANDLE leases.
+ */
+ if (lease_state & SMB2_LEASE_READ_CACHING_HE)
+ set_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
+ else
+ clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
+
+ cifs_queue_oplock_break(cfile);
kfree(lw);
return true;
}
@@ -648,13 +659,6 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK)
return false;
- if (rsp->sync_hdr.CreditRequest) {
- spin_lock(&server->req_lock);
- server->credits += le16_to_cpu(rsp->sync_hdr.CreditRequest);
- spin_unlock(&server->req_lock);
- wake_up(&server->request_q);
- }
-
if (rsp->StructureSize !=
smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) {
if (le16_to_cpu(rsp->StructureSize) == 44)
@@ -708,8 +712,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
&cinode->flags);
spin_unlock(&cfile->file_info_lock);
- queue_work(cifsoplockd_wq,
- &cfile->oplock_break);
+
+ cifs_queue_oplock_break(cfile);
spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 6f96e2292856..c36ff0d1fe2a 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -67,10 +67,13 @@ change_conf(struct TCP_Server_Info *server)
}
static void
-smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
- const int optype)
+smb2_add_credits(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits, const int optype)
{
int *val, rc = -1;
+ unsigned int add = credits->value;
+ unsigned int instance = credits->instance;
+ bool reconnect_detected = false;
spin_lock(&server->req_lock);
val = server->ops->get_credits_field(server, optype);
@@ -79,8 +82,11 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0))
trace_smb3_reconnect_with_invalid_credits(server->CurrentMid,
server->hostname, *val);
+ if ((instance == 0) || (instance == server->reconnect_instance))
+ *val += add;
+ else
+ reconnect_detected = true;
- *val += add;
if (*val > 65000) {
*val = 65000; /* Don't get near 64K credits, avoid srv bugs */
printk_once(KERN_WARNING "server overflowed SMB3 credits\n");
@@ -102,7 +108,12 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- if (server->tcpStatus == CifsNeedReconnect)
+ if (reconnect_detected)
+ cifs_dbg(FYI, "trying to put %d credits from the old server instance %d\n",
+ add, instance);
+
+ if (server->tcpStatus == CifsNeedReconnect
+ || server->tcpStatus == CifsExiting)
return;
switch (rc) {
@@ -163,7 +174,7 @@ smb2_get_credits(struct mid_q_entry *mid)
static int
smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
- unsigned int *num, unsigned int *credits)
+ unsigned int *num, struct cifs_credits *credits)
{
int rc = 0;
unsigned int scredits;
@@ -174,7 +185,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
spin_unlock(&server->req_lock);
cifs_num_waiters_inc(server);
rc = wait_event_killable(server->request_q,
- has_credits(server, &server->credits));
+ has_credits(server, &server->credits, 1));
cifs_num_waiters_dec(server);
if (rc)
return rc;
@@ -189,7 +200,8 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
/* can deadlock with reopen */
if (scredits <= 8) {
*num = SMB2_MAX_BUFFER_SIZE;
- *credits = 0;
+ credits->value = 0;
+ credits->instance = 0;
break;
}
@@ -198,8 +210,10 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
*num = min_t(unsigned int, size,
scredits * SMB2_MAX_BUFFER_SIZE);
- *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
- server->credits -= *credits;
+ credits->value =
+ DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
+ credits->instance = server->reconnect_instance;
+ server->credits -= credits->value;
server->in_flight++;
break;
}
@@ -208,6 +222,38 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
return rc;
}
+static int
+smb2_adjust_credits(struct TCP_Server_Info *server,
+ struct cifs_credits *credits,
+ const unsigned int payload_size)
+{
+ int new_val = DIV_ROUND_UP(payload_size, SMB2_MAX_BUFFER_SIZE);
+
+ if (!credits->value || credits->value == new_val)
+ return 0;
+
+ if (credits->value < new_val) {
+ WARN_ONCE(1, "request has less credits (%d) than required (%d)",
+ credits->value, new_val);
+ return -ENOTSUPP;
+ }
+
+ spin_lock(&server->req_lock);
+
+ if (server->reconnect_instance != credits->instance) {
+ spin_unlock(&server->req_lock);
+ cifs_dbg(VFS, "trying to return %d credits to old session\n",
+ credits->value - new_val);
+ return -EAGAIN;
+ }
+
+ server->credits += credits->value - new_val;
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ credits->value = new_val;
+ return 0;
+}
+
static __u64
smb2_get_next_mid(struct TCP_Server_Info *server)
{
@@ -219,6 +265,15 @@ smb2_get_next_mid(struct TCP_Server_Info *server)
return mid;
}
+static void
+smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
+{
+ spin_lock(&GlobalMid_Lock);
+ if (server->CurrentMid >= val)
+ server->CurrentMid -= val;
+ spin_unlock(&GlobalMid_Lock);
+}
+
static struct mid_q_entry *
smb2_find_mid(struct TCP_Server_Info *server, char *buf)
{
@@ -526,7 +581,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */,
NULL /* no data input */, 0 /* no data input */,
- (char **)&out_buf, &ret_data_len);
+ CIFSMaxBufSize, (char **)&out_buf, &ret_data_len);
if (rc == -EOPNOTSUPP) {
cifs_dbg(FYI,
"server does not support query network interfaces\n");
@@ -564,6 +619,7 @@ smb2_close_cached_fid(struct kref *ref)
SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid,
cfid->fid->volatile_fid);
cfid->is_valid = false;
+ cfid->file_all_info_is_valid = false;
}
}
@@ -588,9 +644,18 @@ smb2_cached_lease_break(struct work_struct *work)
*/
int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
{
- struct cifs_open_parms oparams;
- int rc;
- __le16 srch_path = 0; /* Null - since an open of top of share */
+ struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *server = ses->server;
+ struct cifs_open_parms oparms;
+ struct smb2_create_rsp *o_rsp = NULL;
+ struct smb2_query_info_rsp *qi_rsp = NULL;
+ int resp_buftype[2];
+ struct smb_rqst rqst[2];
+ struct kvec rsp_iov[2];
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ struct kvec qi_iov[1];
+ int rc, flags = 0;
+ __le16 utf16_path = 0; /* Null - since an open of top of share */
u8 oplock = SMB2_OPLOCK_LEVEL_II;
mutex_lock(&tcon->crfid.fid_mutex);
@@ -602,22 +667,85 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
return 0;
}
- oparams.tcon = tcon;
- oparams.create_options = 0;
- oparams.desired_access = FILE_READ_ATTRIBUTES;
- oparams.disposition = FILE_OPEN;
- oparams.fid = pfid;
- oparams.reconnect = false;
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ /* Open */
+ memset(&open_iov, 0, sizeof(open_iov));
+ rqst[0].rq_iov = open_iov;
+ rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
+ oparms.tcon = tcon;
+ oparms.create_options = 0;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.fid = pfid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &utf16_path);
+ if (rc)
+ goto oshr_exit;
+ smb2_set_next_command(tcon, &rqst[0]);
+
+ memset(&qi_iov, 0, sizeof(qi_iov));
+ rqst[1].rq_iov = qi_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID,
+ COMPOUND_FID, FILE_ALL_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_all_info) +
+ PATH_MAX * 2, 0, NULL);
+ if (rc)
+ goto oshr_exit;
- rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL, NULL);
- if (rc == 0) {
- memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid));
- tcon->crfid.tcon = tcon;
- tcon->crfid.is_valid = true;
- kref_init(&tcon->crfid.refcount);
+ smb2_set_related(&rqst[1]);
+
+ rc = compound_send_recv(xid, ses, flags, 2, rqst,
+ resp_buftype, rsp_iov);
+ if (rc)
+ goto oshr_exit;
+
+ o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
+ oparms.fid->persistent_fid = o_rsp->PersistentFileId;
+ oparms.fid->volatile_fid = o_rsp->VolatileFileId;
+#ifdef CONFIG_CIFS_DEBUG2
+ oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId);
+#endif /* CIFS_DEBUG2 */
+
+ memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid));
+ tcon->crfid.tcon = tcon;
+ tcon->crfid.is_valid = true;
+ kref_init(&tcon->crfid.refcount);
+
+ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) {
kref_get(&tcon->crfid.refcount);
- }
+ oplock = smb2_parse_lease_state(server, o_rsp,
+ &oparms.fid->epoch,
+ oparms.fid->lease_key);
+ } else
+ goto oshr_exit;
+
+ qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
+ goto oshr_exit;
+ if (!smb2_validate_and_copy_iov(
+ le16_to_cpu(qi_rsp->OutputBufferOffset),
+ sizeof(struct smb2_file_all_info),
+ &rsp_iov[1], sizeof(struct smb2_file_all_info),
+ (char *)&tcon->crfid.file_all_info))
+ tcon->crfid.file_all_info_is_valid = 1;
+
+ oshr_exit:
mutex_unlock(&tcon->crfid.fid_mutex);
+ SMB2_open_free(&rqst[0]);
+ SMB2_query_info_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
return rc;
}
@@ -940,6 +1068,16 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
memset(rsp_iov, 0, sizeof(rsp_iov));
+ if (ses->server->ops->query_all_EAs) {
+ if (!ea_value) {
+ rc = ses->server->ops->query_all_EAs(xid, tcon, path,
+ ea_name, NULL, 0,
+ cifs_sb);
+ if (rc == -ENODATA)
+ goto sea_exit;
+ }
+ }
+
/* Open */
memset(&open_iov, 0, sizeof(open_iov));
rqst[0].rq_iov = open_iov;
@@ -1157,7 +1295,7 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */,
- NULL, 0 /* no input */,
+ NULL, 0 /* no input */, CIFSMaxBufSize,
(char **)&res_key, &ret_data_len);
if (rc) {
@@ -1188,7 +1326,8 @@ smb2_ioctl_query_info(const unsigned int xid,
struct smb_query_info __user *pqi;
int rc = 0;
int flags = 0;
- struct smb2_query_info_rsp *rsp = NULL;
+ struct smb2_query_info_rsp *qi_rsp = NULL;
+ struct smb2_ioctl_rsp *io_rsp = NULL;
void *buffer = NULL;
struct smb_rqst rqst[3];
int resp_buftype[3];
@@ -1198,6 +1337,7 @@ smb2_ioctl_query_info(const unsigned int xid,
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_fid fid;
struct kvec qi_iov[1];
+ struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
struct kvec close_iov[1];
memset(rqst, 0, sizeof(rqst));
@@ -1248,15 +1388,35 @@ smb2_ioctl_query_info(const unsigned int xid,
smb2_set_next_command(tcon, &rqst[0]);
/* Query */
- memset(&qi_iov, 0, sizeof(qi_iov));
- rqst[1].rq_iov = qi_iov;
- rqst[1].rq_nvec = 1;
-
- rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID,
- qi.file_info_class, qi.info_type,
- qi.additional_information,
+ if (qi.flags & PASSTHRU_FSCTL) {
+ /* Can eventually relax perm check since server enforces too */
+ if (!capable(CAP_SYS_ADMIN))
+ rc = -EPERM;
+ else {
+ memset(&io_iov, 0, sizeof(io_iov));
+ rqst[1].rq_iov = io_iov;
+ rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+ rc = SMB2_ioctl_init(tcon, &rqst[1],
+ COMPOUND_FID, COMPOUND_FID,
+ qi.info_type, true, NULL,
+ 0, CIFSMaxBufSize);
+ }
+ } else if (qi.flags == PASSTHRU_QUERY_INFO) {
+ memset(&qi_iov, 0, sizeof(qi_iov));
+ rqst[1].rq_iov = qi_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID,
+ COMPOUND_FID, qi.file_info_class,
+ qi.info_type, qi.additional_information,
qi.input_buffer_length,
qi.output_buffer_length, buffer);
+ } else { /* unknown flags */
+ cifs_dbg(VFS, "invalid passthru query flags: 0x%x\n", qi.flags);
+ rc = -EINVAL;
+ }
+
if (rc)
goto iqinf_exit;
smb2_set_next_command(tcon, &rqst[1]);
@@ -1276,24 +1436,44 @@ smb2_ioctl_query_info(const unsigned int xid,
resp_buftype, rsp_iov);
if (rc)
goto iqinf_exit;
- pqi = (struct smb_query_info __user *)arg;
- rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
- if (le32_to_cpu(rsp->OutputBufferLength) < qi.input_buffer_length)
- qi.input_buffer_length = le32_to_cpu(rsp->OutputBufferLength);
- if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
- sizeof(qi.input_buffer_length))) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
- if (copy_to_user(pqi + 1, rsp->Buffer, qi.input_buffer_length)) {
- rc = -EFAULT;
- goto iqinf_exit;
+ if (qi.flags & PASSTHRU_FSCTL) {
+ pqi = (struct smb_query_info __user *)arg;
+ io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(io_rsp->OutputCount) < qi.input_buffer_length)
+ qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount);
+ if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+ if (copy_to_user(pqi + 1, &io_rsp[1], qi.input_buffer_length)) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+ } else {
+ pqi = (struct smb_query_info __user *)arg;
+ qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(qi_rsp->OutputBufferLength) < qi.input_buffer_length)
+ qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength);
+ if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+ if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
}
iqinf_exit:
kfree(buffer);
SMB2_open_free(&rqst[0]);
- SMB2_query_info_free(&rqst[1]);
+ if (qi.flags & PASSTHRU_FSCTL)
+ SMB2_ioctl_free(&rqst[1]);
+ else
+ SMB2_query_info_free(&rqst[1]);
+
SMB2_close_free(&rqst[2]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
@@ -1348,8 +1528,8 @@ smb2_copychunk_range(const unsigned int xid,
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
true /* is_fsctl */, (char *)pcchunk,
- sizeof(struct copychunk_ioctl), (char **)&retbuf,
- &ret_data_len);
+ sizeof(struct copychunk_ioctl), CIFSMaxBufSize,
+ (char **)&retbuf, &ret_data_len);
if (rc == 0) {
if (ret_data_len !=
sizeof(struct copychunk_ioctl_rsp)) {
@@ -1509,7 +1689,7 @@ static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, FSCTL_SET_SPARSE,
true /* is_fctl */,
- &setsparse, 1, NULL, NULL);
+ &setsparse, 1, CIFSMaxBufSize, NULL, NULL);
if (rc) {
tcon->broken_sparse_sup = true;
cifs_dbg(FYI, "set sparse rc = %d\n", rc);
@@ -1582,7 +1762,7 @@ smb2_duplicate_extents(const unsigned int xid,
true /* is_fsctl */,
(char *)&dup_ext_buf,
sizeof(struct duplicate_extents_to_file),
- NULL,
+ CIFSMaxBufSize, NULL,
&ret_data_len);
if (ret_data_len > 0)
@@ -1617,7 +1797,7 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon,
true /* is_fsctl */,
(char *)&integr_info,
sizeof(struct fsctl_set_integrity_information_req),
- NULL,
+ CIFSMaxBufSize, NULL,
&ret_data_len);
}
@@ -1625,6 +1805,8 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon,
/* GMT Token is @GMT-YYYY.MM.DD-HH.MM.SS Unicode which is 48 bytes + null */
#define GMT_TOKEN_SIZE 50
+#define MIN_SNAPSHOT_ARRAY_SIZE 16 /* See MS-SMB2 section 3.3.5.15.1 */
+
/*
* Input buffer contains (empty) struct smb_snapshot array with size filled in
* For output see struct SRV_SNAPSHOT_ARRAY in MS-SMB2 section 2.2.32.2
@@ -1636,13 +1818,29 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon,
char *retbuf = NULL;
unsigned int ret_data_len = 0;
int rc;
+ u32 max_response_size;
struct smb_snapshot_array snapshot_in;
+ if (get_user(ret_data_len, (unsigned int __user *)ioc_buf))
+ return -EFAULT;
+
+ /*
+ * Note that for snapshot queries that servers like Azure expect that
+ * the first query be minimal size (and just used to get the number/size
+ * of previous versions) so response size must be specified as EXACTLY
+ * sizeof(struct snapshot_array) which is 16 when rounded up to multiple
+ * of eight bytes.
+ */
+ if (ret_data_len == 0)
+ max_response_size = MIN_SNAPSHOT_ARRAY_SIZE;
+ else
+ max_response_size = CIFSMaxBufSize;
+
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
FSCTL_SRV_ENUMERATE_SNAPSHOTS,
true /* is_fsctl */,
- NULL, 0 /* no input data */,
+ NULL, 0 /* no input data */, max_response_size,
(char **)&retbuf,
&ret_data_len);
cifs_dbg(FYI, "enum snaphots ioctl returned %d and ret buflen is %d\n",
@@ -1753,14 +1951,14 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
* the number of credits and return true. Otherwise - return false.
*/
static bool
-smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
+smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
{
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
if (shdr->Status != STATUS_PENDING)
return false;
- if (!length) {
+ if (shdr->CreditRequest) {
spin_lock(&server->req_lock);
server->credits += le16_to_cpu(shdr->CreditRequest);
spin_unlock(&server->req_lock);
@@ -2120,7 +2318,7 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_DFS_GET_REFERRALS,
true /* is_fsctl */,
- (char *)dfs_req, dfs_req_size,
+ (char *)dfs_req, dfs_req_size, CIFSMaxBufSize,
(char **)&dfs_rsp, &dfs_rsp_size);
} while (rc == -EAGAIN);
@@ -2191,6 +2389,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov,
&resp_buftype);
+ if (!rc)
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
if (!rc || !err_iov.iov_base) {
rc = -ENOENT;
goto free_path;
@@ -2407,22 +2607,38 @@ get_smb2_acl(struct cifs_sb_info *cifs_sb,
static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
loff_t offset, loff_t len, bool keep_size)
{
+ struct cifs_ses *ses = tcon->ses;
struct inode *inode;
struct cifsInodeInfo *cifsi;
struct cifsFileInfo *cfile = file->private_data;
struct file_zero_data_information fsctl_buf;
+ struct smb_rqst rqst[2];
+ int resp_buftype[2];
+ struct kvec rsp_iov[2];
+ struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
+ struct kvec si_iov[1];
+ unsigned int size[1];
+ void *data[1];
long rc;
unsigned int xid;
+ int num = 0, flags = 0;
+ __le64 eof;
xid = get_xid();
inode = d_inode(cfile->dentry);
cifsi = CIFS_I(inode);
+ trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len);
+
+
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
if (keep_size == false) {
rc = -EOPNOTSUPP;
+ trace_smb3_zero_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, ses->Suid, offset, len, rc);
free_xid(xid);
return rc;
}
@@ -2433,33 +2649,74 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
*/
if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) {
rc = -EOPNOTSUPP;
+ trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len, rc);
free_xid(xid);
return rc;
}
- /*
- * need to make sure we are not asked to extend the file since the SMB3
- * fsctl does not change the file size. In the future we could change
- * this to zero the first part of the range then set the file size
- * which for a non sparse file would zero the newly extended range
- */
- if (keep_size == false)
- if (i_size_read(inode) < offset + len) {
- rc = -EOPNOTSUPP;
- free_xid(xid);
- return rc;
- }
-
cifs_dbg(FYI, "offset %lld len %lld", offset, len);
fsctl_buf.FileOffset = cpu_to_le64(offset);
fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
- rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
- true /* is_fctl */, (char *)&fsctl_buf,
- sizeof(struct file_zero_data_information), NULL, NULL);
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+
+ memset(&io_iov, 0, sizeof(io_iov));
+ rqst[num].rq_iov = io_iov;
+ rqst[num].rq_nvec = SMB2_IOCTL_IOV_SIZE;
+ rc = SMB2_ioctl_init(tcon, &rqst[num++], cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
+ true /* is_fctl */, (char *)&fsctl_buf,
+ sizeof(struct file_zero_data_information),
+ CIFSMaxBufSize);
+ if (rc)
+ goto zero_range_exit;
+
+ /*
+ * do we also need to change the size of the file?
+ */
+ if (keep_size == false && i_size_read(inode) < offset + len) {
+ smb2_set_next_command(tcon, &rqst[0]);
+
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[num].rq_iov = si_iov;
+ rqst[num].rq_nvec = 1;
+
+ eof = cpu_to_le64(offset + len);
+ size[0] = 8; /* sizeof __le64 */
+ data[0] = &eof;
+
+ rc = SMB2_set_info_init(tcon, &rqst[num++],
+ cfile->fid.persistent_fid,
+ cfile->fid.persistent_fid,
+ current->tgid,
+ FILE_END_OF_FILE_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ smb2_set_related(&rqst[1]);
+ }
+
+ rc = compound_send_recv(xid, ses, flags, num, rqst,
+ resp_buftype, rsp_iov);
+
+ zero_range_exit:
+ SMB2_ioctl_free(&rqst[0]);
+ SMB2_set_info_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
free_xid(xid);
+ if (rc)
+ trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len, rc);
+ else
+ trace_smb3_zero_done(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len);
return rc;
}
@@ -2495,7 +2752,8 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
true /* is_fctl */, (char *)&fsctl_buf,
- sizeof(struct file_zero_data_information), NULL, NULL);
+ sizeof(struct file_zero_data_information),
+ CIFSMaxBufSize, NULL, NULL);
free_xid(xid);
return rc;
}
@@ -2508,15 +2766,20 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
struct cifsFileInfo *cfile = file->private_data;
long rc = -EOPNOTSUPP;
unsigned int xid;
+ __le64 eof;
xid = get_xid();
inode = d_inode(cfile->dentry);
cifsi = CIFS_I(inode);
+ trace_smb3_falloc_enter(xid, cfile->fid.persistent_fid, tcon->tid,
+ tcon->ses->Suid, off, len);
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
if (keep_size == false) {
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len, rc);
free_xid(xid);
return rc;
}
@@ -2536,6 +2799,12 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
/* BB: in future add else clause to extend file */
else
rc = -EOPNOTSUPP;
+ if (rc)
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len, rc);
+ else
+ trace_smb3_falloc_done(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len);
free_xid(xid);
return rc;
}
@@ -2551,14 +2820,31 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
*/
if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) {
rc = -EOPNOTSUPP;
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len, rc);
free_xid(xid);
return rc;
}
- rc = smb2_set_sparse(xid, tcon, cfile, inode, false);
+ smb2_set_sparse(xid, tcon, cfile, inode, false);
+ rc = 0;
+ } else {
+ smb2_set_sparse(xid, tcon, cfile, inode, false);
+ rc = 0;
+ if (i_size_read(inode) < off + len) {
+ eof = cpu_to_le64(off + len);
+ rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, cfile->pid,
+ &eof);
+ }
}
- /* BB: else ... in future add code to extend file and set sparse */
+ if (rc)
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, tcon->tid,
+ tcon->ses->Suid, off, len, rc);
+ else
+ trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, tcon->tid,
+ tcon->ses->Suid, off, len);
free_xid(xid);
return rc;
@@ -2595,6 +2881,15 @@ smb2_downgrade_oplock(struct TCP_Server_Info *server,
}
static void
+smb21_downgrade_oplock(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, bool set_level2)
+{
+ server->ops->set_oplock_level(cinode,
+ set_level2 ? SMB2_LEASE_READ_CACHING_HE :
+ 0, 0, NULL);
+}
+
+static void
smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
unsigned int epoch, bool *purge_cache)
{
@@ -3210,15 +3505,15 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
}
if (server->ops->is_status_pending &&
- server->ops->is_status_pending(buf, server, 0))
+ server->ops->is_status_pending(buf, server))
return -1;
/* set up first two iov to get credits */
rdata->iov[0].iov_base = buf;
- rdata->iov[0].iov_len = 4;
- rdata->iov[1].iov_base = buf + 4;
+ rdata->iov[0].iov_len = 0;
+ rdata->iov[1].iov_base = buf;
rdata->iov[1].iov_len =
- min_t(unsigned int, buf_len, server->vals->read_rsp_size) - 4;
+ min_t(unsigned int, buf_len, server->vals->read_rsp_size);
cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
rdata->iov[0].iov_base, rdata->iov[0].iov_len);
cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
@@ -3530,6 +3825,104 @@ smb2_next_header(char *buf)
return le32_to_cpu(hdr->NextCommand);
}
+static int
+smb2_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ char *full_path, umode_t mode, dev_t dev)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ int rc = -EPERM;
+ int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
+ FILE_ALL_INFO *buf = NULL;
+ struct cifs_io_parms io_parms;
+ __u32 oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ unsigned int bytes_written;
+ struct win_dev *pdev;
+ struct kvec iov[2];
+
+ /*
+ * Check if mounted with mount parm 'sfu' mount parm.
+ * SFU emulation should work with all servers, but only
+ * supports block and char device (no socket & fifo),
+ * and was used by default in earlier versions of Windows
+ */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+ goto out;
+
+ /*
+ * TODO: Add ability to create instead via reparse point. Windows (e.g.
+ * their current NFS server) uses this approach to expose special files
+ * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions
+ */
+
+ if (!S_ISCHR(mode) && !S_ISBLK(mode))
+ goto out;
+
+ cifs_dbg(FYI, "sfu compat create special file\n");
+
+ buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+ if (buf == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_WRITE;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_CREATE;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ if (tcon->ses->server->oplocks)
+ oplock = REQ_OPLOCK;
+ else
+ oplock = 0;
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+ if (rc)
+ goto out;
+
+ /*
+ * BB Do not bother to decode buf since no local inode yet to put
+ * timestamps in, but we can reuse it safely.
+ */
+
+ pdev = (struct win_dev *)buf;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = sizeof(struct win_dev);
+ iov[1].iov_base = buf;
+ iov[1].iov_len = sizeof(struct win_dev);
+ if (S_ISCHR(mode)) {
+ memcpy(pdev->type, "IntxCHR", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ } else if (S_ISBLK(mode)) {
+ memcpy(pdev->type, "IntxBLK", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ }
+ tcon->ses->server->ops->close(xid, tcon, &fid);
+ d_drop(dentry);
+
+ /* FIXME: add code here to set EAs */
+out:
+ kfree(buf);
+ return rc;
+}
+
+
struct smb_version_operations smb20_operations = {
.compare_fids = smb2_compare_fids,
.setup_request = smb2_setup_request,
@@ -3541,6 +3934,7 @@ struct smb_version_operations smb20_operations = {
.get_credits = smb2_get_credits,
.wait_mtu_credits = cifs_wait_mtu_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3623,6 +4017,7 @@ struct smb_version_operations smb20_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_operations smb21_operations = {
@@ -3635,7 +4030,9 @@ struct smb_version_operations smb21_operations = {
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
.wait_mtu_credits = smb2_wait_mtu_credits,
+ .adjust_credits = smb2_adjust_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3646,7 +4043,7 @@ struct smb_version_operations smb21_operations = {
.print_stats = smb2_print_stats,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb2_downgrade_oplock,
+ .downgrade_oplock = smb21_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
@@ -3719,6 +4116,7 @@ struct smb_version_operations smb21_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_operations smb30_operations = {
@@ -3731,7 +4129,9 @@ struct smb_version_operations smb30_operations = {
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
.wait_mtu_credits = smb2_wait_mtu_credits,
+ .adjust_credits = smb2_adjust_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3743,7 +4143,7 @@ struct smb_version_operations smb30_operations = {
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb2_downgrade_oplock,
+ .downgrade_oplock = smb21_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb3_negotiate_wsize,
@@ -3824,6 +4224,7 @@ struct smb_version_operations smb30_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_operations smb311_operations = {
@@ -3836,7 +4237,9 @@ struct smb_version_operations smb311_operations = {
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
.wait_mtu_credits = smb2_wait_mtu_credits,
+ .adjust_credits = smb2_adjust_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3848,7 +4251,7 @@ struct smb_version_operations smb311_operations = {
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb2_downgrade_oplock,
+ .downgrade_oplock = smb21_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb3_negotiate_wsize,
@@ -3930,6 +4333,7 @@ struct smb_version_operations smb311_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 77b3aaa39b35..a37774a55f3a 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -107,13 +107,13 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
struct TCP_Server_Info *server = tcon->ses->server;
spin_lock(&server->req_lock);
- /* Request up to 2 credits but don't go over the limit. */
+ /* Request up to 10 credits but don't go over the limit. */
if (server->credits >= server->max_credits)
shdr->CreditRequest = cpu_to_le16(0);
else
shdr->CreditRequest = cpu_to_le16(
min_t(int, server->max_credits -
- server->credits, 2));
+ server->credits, 10));
spin_unlock(&server->req_lock);
} else {
shdr->CreditRequest = cpu_to_le16(2);
@@ -173,8 +173,8 @@ static int __smb2_reconnect(const struct nls_table *nlsc,
return -ENOMEM;
if (tcon->ipc) {
- snprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
- tcon->ses->server->hostname);
+ scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
+ tcon->ses->server->hostname);
rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
goto out;
}
@@ -206,7 +206,7 @@ static int __smb2_reconnect(const struct nls_table *nlsc,
continue;
}
- snprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
+ scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
if (!rc)
@@ -490,6 +490,23 @@ build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt)
{
pneg_ctxt->ContextType = SMB2_POSIX_EXTENSIONS_AVAILABLE;
pneg_ctxt->DataLength = cpu_to_le16(POSIX_CTXT_DATA_LEN);
+ /* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */
+ pneg_ctxt->Name[0] = 0x93;
+ pneg_ctxt->Name[1] = 0xAD;
+ pneg_ctxt->Name[2] = 0x25;
+ pneg_ctxt->Name[3] = 0x50;
+ pneg_ctxt->Name[4] = 0x9C;
+ pneg_ctxt->Name[5] = 0xB4;
+ pneg_ctxt->Name[6] = 0x11;
+ pneg_ctxt->Name[7] = 0xE7;
+ pneg_ctxt->Name[8] = 0xB4;
+ pneg_ctxt->Name[9] = 0x23;
+ pneg_ctxt->Name[10] = 0x83;
+ pneg_ctxt->Name[11] = 0xDE;
+ pneg_ctxt->Name[12] = 0x96;
+ pneg_ctxt->Name[13] = 0x8B;
+ pneg_ctxt->Name[14] = 0xCD;
+ pneg_ctxt->Name[15] = 0x7C;
}
static void
@@ -815,8 +832,11 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
} else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) {
/* ops set to 3.0 by default for default so update */
ses->server->ops = &smb21_operations;
- } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID))
+ ses->server->vals = &smb21_values;
+ } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) {
ses->server->ops = &smb311_operations;
+ ses->server->vals = &smb311_values;
+ }
} else if (le16_to_cpu(rsp->DialectRevision) !=
ses->server->vals->protocol_id) {
/* if requested single dialect ensure returned dialect matched */
@@ -985,9 +1005,16 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
- (char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen);
-
- if (rc != 0) {
+ (char *)pneg_inbuf, inbuflen, CIFSMaxBufSize,
+ (char **)&pneg_rsp, &rsplen);
+ if (rc == -EOPNOTSUPP) {
+ /*
+ * Old Windows versions or Netapp SMB server can return
+ * not supported error. Client should accept it.
+ */
+ cifs_dbg(VFS, "Server does not support validate negotiate\n");
+ return 0;
+ } else if (rc != 0) {
cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
rc = -EIO;
goto out_free_inbuf;
@@ -1605,15 +1632,25 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
iov[1].iov_base = unc_path;
iov[1].iov_len = unc_path_len;
- /* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */
+ /*
+ * 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1
+ * unless it is guest or anonymous user. See MS-SMB2 3.2.5.3.1
+ * (Samba servers don't always set the flag so also check if null user)
+ */
if ((ses->server->dialect == SMB311_PROT_ID) &&
- !smb3_encryption_required(tcon))
+ !smb3_encryption_required(tcon) &&
+ !(ses->session_flags &
+ (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) &&
+ ((ses->user_name != NULL) || (ses->sectype == Kerberos)))
req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
memset(&rqst, 0, sizeof(struct smb_rqst));
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
+ /* Need 64 for max size write so ask for more in case not there yet */
+ req->sync_hdr.CreditRequest = cpu_to_le16(64);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base;
@@ -1771,9 +1808,10 @@ create_reconnect_durable_buf(struct cifs_fid *fid)
return buf;
}
-static __u8
-parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
- unsigned int *epoch, char *lease_key)
+__u8
+smb2_parse_lease_state(struct TCP_Server_Info *server,
+ struct smb2_create_rsp *rsp,
+ unsigned int *epoch, char *lease_key)
{
char *data_offset;
struct create_context *cc;
@@ -1824,8 +1862,9 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
}
static struct create_durable_v2 *
-create_durable_v2_buf(struct cifs_fid *pfid)
+create_durable_v2_buf(struct cifs_open_parms *oparms)
{
+ struct cifs_fid *pfid = oparms->fid;
struct create_durable_v2 *buf;
buf = kzalloc(sizeof(struct create_durable_v2), GFP_KERNEL);
@@ -1839,7 +1878,14 @@ create_durable_v2_buf(struct cifs_fid *pfid)
(struct create_durable_v2, Name));
buf->ccontext.NameLength = cpu_to_le16(4);
- buf->dcontext.Timeout = 0; /* Should this be configurable by workload */
+ /*
+ * NB: Handle timeout defaults to 0, which allows server to choose
+ * (most servers default to 120 seconds) and most clients default to 0.
+ * This can be overridden at mount ("handletimeout=") if the user wants
+ * a different persistent (or resilient) handle timeout for all opens
+ * opens on a particular SMB3 mount.
+ */
+ buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout);
buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
generate_random_uuid(buf->dcontext.CreateGuid);
memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
@@ -1892,7 +1938,7 @@ add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec,
struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
- iov[num].iov_base = create_durable_v2_buf(oparms->fid);
+ iov[num].iov_base = create_durable_v2_buf(oparms);
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = sizeof(struct create_durable_v2);
@@ -2170,6 +2216,8 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
rqst.rq_iov = iov;
rqst.rq_nvec = n_iov;
+ trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE,
+ FILE_WRITE_ATTRIBUTES);
/* resource #4: response buffer */
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
if (rc) {
@@ -2388,6 +2436,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
if (rc)
goto creat_exit;
+ trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid,
+ oparms->create_options, oparms->desired_access);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags,
&rsp_iov);
rsp = (struct smb2_create_rsp *)rsp_iov.iov_base;
@@ -2425,8 +2476,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
}
if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
- *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch,
- oparms->fid->lease_key);
+ *oplock = smb2_parse_lease_state(server, rsp,
+ &oparms->fid->epoch,
+ oparms->fid->lease_key);
else
*oplock = rsp->OplockLevel;
creat_exit:
@@ -2435,113 +2487,138 @@ creat_exit:
return rc;
}
-/*
- * SMB2 IOCTL is used for both IOCTLs and FSCTLs
- */
int
-SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
- u64 volatile_fid, u32 opcode, bool is_fsctl,
- char *in_data, u32 indatalen,
- char **out_data, u32 *plen /* returned data len */)
+SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
+ u64 persistent_fid, u64 volatile_fid, u32 opcode,
+ bool is_fsctl, char *in_data, u32 indatalen,
+ __u32 max_response_size)
{
- struct smb_rqst rqst;
struct smb2_ioctl_req *req;
- struct smb2_ioctl_rsp *rsp;
- struct cifs_ses *ses;
- struct kvec iov[2];
- struct kvec rsp_iov;
- int resp_buftype;
- int n_iov;
- int rc = 0;
- int flags = 0;
+ struct kvec *iov = rqst->rq_iov;
unsigned int total_len;
-
- cifs_dbg(FYI, "SMB2 IOCTL\n");
-
- if (out_data != NULL)
- *out_data = NULL;
-
- /* zero out returned data len, in case of error */
- if (plen)
- *plen = 0;
-
- if (tcon)
- ses = tcon->ses;
- else
- return -EIO;
-
- if (!ses || !(ses->server))
- return -EIO;
+ int rc;
rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len);
if (rc)
return rc;
- if (smb3_encryption_required(tcon))
- flags |= CIFS_TRANSFORM_REQ;
-
req->CtlCode = cpu_to_le32(opcode);
req->PersistentFileId = persistent_fid;
req->VolatileFileId = volatile_fid;
+ iov[0].iov_base = (char *)req;
+ /*
+ * If no input data, the size of ioctl struct in
+ * protocol spec still includes a 1 byte data buffer,
+ * but if input data passed to ioctl, we do not
+ * want to double count this, so we do not send
+ * the dummy one byte of data in iovec[0] if sending
+ * input data (in iovec[1]).
+ */
if (indatalen) {
req->InputCount = cpu_to_le32(indatalen);
/* do not set InputOffset if no input data */
req->InputOffset =
cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer));
+ rqst->rq_nvec = 2;
+ iov[0].iov_len = total_len - 1;
iov[1].iov_base = in_data;
iov[1].iov_len = indatalen;
- n_iov = 2;
- } else
- n_iov = 1;
+ } else {
+ rqst->rq_nvec = 1;
+ iov[0].iov_len = total_len;
+ }
req->OutputOffset = 0;
req->OutputCount = 0; /* MBZ */
/*
- * Could increase MaxOutputResponse, but that would require more
- * than one credit. Windows typically sets this smaller, but for some
+ * In most cases max_response_size is set to 16K (CIFSMaxBufSize)
+ * We Could increase default MaxOutputResponse, but that could require
+ * more credits. Windows typically sets this smaller, but for some
* ioctls it may be useful to allow server to send more. No point
* limiting what the server can send as long as fits in one credit
- * Unfortunately - we can not handle more than CIFS_MAX_MSG_SIZE
- * (by default, note that it can be overridden to make max larger)
- * in responses (except for read responses which can be bigger.
- * We may want to bump this limit up
+ * We can not handle more than CIFS_MAX_BUF_SIZE yet but may want
+ * to increase this limit up in the future.
+ * Note that for snapshot queries that servers like Azure expect that
+ * the first query be minimal size (and just used to get the number/size
+ * of previous versions) so response size must be specified as EXACTLY
+ * sizeof(struct snapshot_array) which is 16 when rounded up to multiple
+ * of eight bytes. Currently that is the only case where we set max
+ * response size smaller.
*/
- req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize);
+ req->MaxOutputResponse = cpu_to_le32(max_response_size);
if (is_fsctl)
req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
else
req->Flags = 0;
- iov[0].iov_base = (char *)req;
-
- /*
- * If no input data, the size of ioctl struct in
- * protocol spec still includes a 1 byte data buffer,
- * but if input data passed to ioctl, we do not
- * want to double count this, so we do not send
- * the dummy one byte of data in iovec[0] if sending
- * input data (in iovec[1]).
- */
-
- if (indatalen) {
- iov[0].iov_len = total_len - 1;
- } else
- iov[0].iov_len = total_len;
-
/* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ return 0;
+}
+
+void
+SMB2_ioctl_free(struct smb_rqst *rqst)
+{
+ if (rqst && rqst->rq_iov)
+ cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+}
+
+
+/*
+ * SMB2 IOCTL is used for both IOCTLs and FSCTLs
+ */
+int
+SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+ u64 volatile_fid, u32 opcode, bool is_fsctl,
+ char *in_data, u32 indatalen, u32 max_out_data_len,
+ char **out_data, u32 *plen /* returned data len */)
+{
+ struct smb_rqst rqst;
+ struct smb2_ioctl_rsp *rsp = NULL;
+ struct cifs_ses *ses;
+ struct kvec iov[SMB2_IOCTL_IOV_SIZE];
+ struct kvec rsp_iov = {NULL, 0};
+ int resp_buftype = CIFS_NO_BUFFER;
+ int rc = 0;
+ int flags = 0;
+
+ cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+ if (out_data != NULL)
+ *out_data = NULL;
+
+ /* zero out returned data len, in case of error */
+ if (plen)
+ *plen = 0;
+
+ if (tcon)
+ ses = tcon->ses;
+ else
+ return -EIO;
+
+ if (!ses || !(ses->server))
+ return -EIO;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
memset(&rqst, 0, sizeof(struct smb_rqst));
+ memset(&iov, 0, sizeof(iov));
rqst.rq_iov = iov;
- rqst.rq_nvec = n_iov;
+ rqst.rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+ rc = SMB2_ioctl_init(tcon, &rqst, persistent_fid, volatile_fid, opcode,
+ is_fsctl, in_data, indatalen, max_out_data_len);
+ if (rc)
+ goto ioctl_exit;
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags,
&rsp_iov);
- cifs_small_buf_release(req);
rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base;
if (rc != 0)
@@ -2591,6 +2668,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
}
ioctl_exit:
+ SMB2_ioctl_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
return rc;
}
@@ -2613,7 +2691,8 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
FSCTL_SET_COMPRESSION, true /* is_fsctl */,
(char *)&fsctl_input /* data input */,
- 2 /* in data len */, &ret_data /* out data */, NULL);
+ 2 /* in data len */, CIFSMaxBufSize /* max out data */,
+ &ret_data /* out data */, NULL);
cifs_dbg(FYI, "set compression rc %d\n", rc);
@@ -2837,6 +2916,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
goto qinf_exit;
+ trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid,
+ ses->Suid, info_class, (__u32)info_type);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
@@ -2847,6 +2929,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
goto qinf_exit;
}
+ trace_smb3_query_info_done(xid, persistent_fid, tcon->tid,
+ ses->Suid, info_class, (__u32)info_type);
+
if (dlen) {
*dlen = le32_to_cpu(rsp->OutputBufferLength);
if (!*data) {
@@ -2924,14 +3009,16 @@ smb2_echo_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->callback_data;
struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf;
- unsigned int credits_received = 0;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
if (mid->mid_state == MID_RESPONSE_RECEIVED
- || mid->mid_state == MID_RESPONSE_MALFORMED)
- credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ || mid->mid_state == MID_RESPONSE_MALFORMED) {
+ credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.instance = server->reconnect_instance;
+ }
DeleteMidQEntry(mid);
- add_credits(server, credits_received, CIFS_ECHO_OP);
+ add_credits(server, &credits, CIFS_ECHO_OP);
}
void smb2_reconnect_server(struct work_struct *work)
@@ -3023,7 +3110,7 @@ SMB2_echo(struct TCP_Server_Info *server)
iov[0].iov_base = (char *)req;
rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL,
- server, CIFS_ECHO_OP);
+ server, CIFS_ECHO_OP, NULL);
if (rc)
cifs_dbg(FYI, "Echo request failed: %d\n", rc);
@@ -3114,6 +3201,11 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
req->MinimumCount = 0;
req->Length = cpu_to_le32(io_parms->length);
req->Offset = cpu_to_le64(io_parms->offset);
+
+ trace_smb3_read_enter(0 /* xid */,
+ io_parms->persistent_fid,
+ io_parms->tcon->tid, io_parms->tcon->ses->Suid,
+ io_parms->offset, io_parms->length);
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a RDMA write, fill in and append
@@ -3184,7 +3276,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
struct TCP_Server_Info *server = tcon->ses->server;
struct smb2_sync_hdr *shdr =
(struct smb2_sync_hdr *)rdata->iov[0].iov_base;
- unsigned int credits_received = 0;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2,
.rq_pages = rdata->pages,
@@ -3199,7 +3291,8 @@ smb2_readv_callback(struct mid_q_entry *mid)
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- credits_received = le16_to_cpu(shdr->CreditRequest);
+ credits.value = le16_to_cpu(shdr->CreditRequest);
+ credits.instance = server->reconnect_instance;
/* result already set, check signature */
if (server->sign && !mid->decrypted) {
int rc;
@@ -3224,11 +3317,11 @@ smb2_readv_callback(struct mid_q_entry *mid)
cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
case MID_RESPONSE_MALFORMED:
- credits_received = le16_to_cpu(shdr->CreditRequest);
+ credits.value = le16_to_cpu(shdr->CreditRequest);
+ credits.instance = server->reconnect_instance;
/* fall through */
default:
- if (rdata->result != -ENODATA)
- rdata->result = -EIO;
+ rdata->result = -EIO;
}
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
@@ -3255,7 +3348,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &rdata->work);
DeleteMidQEntry(mid);
- add_credits(server, credits_received, 0);
+ add_credits(server, &credits, 0);
}
/* smb2_async_readv - send an async read, and set up mid to handle result */
@@ -3285,17 +3378,8 @@ smb2_async_readv(struct cifs_readdata *rdata)
rc = smb2_new_read_req(
(void **) &buf, &total_len, &io_parms, rdata, 0, 0);
- if (rc) {
- if (rc == -EAGAIN && rdata->credits) {
- /* credits was reset by reconnect */
- rdata->credits = 0;
- /* reduce in_flight value since we won't send the req */
- spin_lock(&server->req_lock);
- server->in_flight--;
- spin_unlock(&server->req_lock);
- }
+ if (rc)
return rc;
- }
if (smb3_encryption_required(io_parms.tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -3305,24 +3389,24 @@ smb2_async_readv(struct cifs_readdata *rdata)
shdr = (struct smb2_sync_hdr *)buf;
- if (rdata->credits) {
+ if (rdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
SMB2_MAX_BUFFER_SIZE));
shdr->CreditRequest =
cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
- spin_lock(&server->req_lock);
- server->credits += rdata->credits -
- le16_to_cpu(shdr->CreditCharge);
- spin_unlock(&server->req_lock);
- wake_up(&server->request_q);
- rdata->credits = le16_to_cpu(shdr->CreditCharge);
+
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+ if (rc)
+ goto async_readv_out;
+
flags |= CIFS_HAS_CREDITS;
}
kref_get(&rdata->refcount);
rc = cifs_call_async(io_parms.tcon->ses->server, &rqst,
cifs_readv_receive, smb2_readv_callback,
- smb3_handle_read_data, rdata, flags);
+ smb3_handle_read_data, rdata, flags,
+ &rdata->credits);
if (rc) {
kref_put(&rdata->refcount, cifs_readdata_release);
cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
@@ -3332,6 +3416,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
io_parms.offset, io_parms.length, rc);
}
+async_readv_out:
cifs_small_buf_release(buf);
return rc;
}
@@ -3366,8 +3451,6 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
rqst.rq_nvec = 1;
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
- cifs_small_buf_release(req);
-
rsp = (struct smb2_read_rsp *)rsp_iov.iov_base;
if (rc) {
@@ -3378,14 +3461,20 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length,
rc);
- }
+ } else
+ trace_smb3_read_done(xid, req->PersistentFileId,
+ io_parms->tcon->tid, ses->Suid,
+ io_parms->offset, 0);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+ cifs_small_buf_release(req);
return rc == -ENODATA ? 0 : rc;
} else
trace_smb3_read_done(xid, req->PersistentFileId,
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length);
+ cifs_small_buf_release(req);
+
*nbytes = le32_to_cpu(rsp->DataLength);
if ((*nbytes > CIFS_MAX_MSGSIZE) ||
(*nbytes > io_parms->length)) {
@@ -3417,14 +3506,16 @@ smb2_writev_callback(struct mid_q_entry *mid)
{
struct cifs_writedata *wdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
unsigned int written;
struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
- unsigned int credits_received = 0;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
- wdata->result = smb2_check_receive(mid, tcon->ses->server, 0);
+ credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.instance = server->reconnect_instance;
+ wdata->result = smb2_check_receive(mid, server, 0);
if (wdata->result != 0)
break;
@@ -3448,7 +3539,8 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->result = -EAGAIN;
break;
case MID_RESPONSE_MALFORMED:
- credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.instance = server->reconnect_instance;
/* fall through */
default:
wdata->result = -EIO;
@@ -3481,7 +3573,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &wdata->work);
DeleteMidQEntry(mid);
- add_credits(tcon->ses->server, credits_received, 0);
+ add_credits(server, &credits, 0);
}
/* smb2_async_writev - send an async write, and set up mid to handle result */
@@ -3499,17 +3591,8 @@ smb2_async_writev(struct cifs_writedata *wdata,
unsigned int total_len;
rc = smb2_plain_req_init(SMB2_WRITE, tcon, (void **) &req, &total_len);
- if (rc) {
- if (rc == -EAGAIN && wdata->credits) {
- /* credits was reset by reconnect */
- wdata->credits = 0;
- /* reduce in_flight value since we won't send the req */
- spin_lock(&server->req_lock);
- server->in_flight--;
- spin_unlock(&server->req_lock);
- }
- goto async_writev_out;
- }
+ if (rc)
+ return rc;
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -3526,6 +3609,9 @@ smb2_async_writev(struct cifs_writedata *wdata,
req->DataOffset = cpu_to_le16(
offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
+
+ trace_smb3_write_enter(0 /* xid */, wdata->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes);
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a server RDMA read, fill in and append
@@ -3595,23 +3681,22 @@ smb2_async_writev(struct cifs_writedata *wdata,
req->Length = cpu_to_le32(wdata->bytes);
#endif
- if (wdata->credits) {
+ if (wdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
SMB2_MAX_BUFFER_SIZE));
shdr->CreditRequest =
cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
- spin_lock(&server->req_lock);
- server->credits += wdata->credits -
- le16_to_cpu(shdr->CreditCharge);
- spin_unlock(&server->req_lock);
- wake_up(&server->request_q);
- wdata->credits = le16_to_cpu(shdr->CreditCharge);
+
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+ if (rc)
+ goto async_writev_out;
+
flags |= CIFS_HAS_CREDITS;
}
kref_get(&wdata->refcount);
rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, NULL,
- wdata, flags);
+ wdata, flags, &wdata->credits);
if (rc) {
trace_smb3_write_err(0 /* no xid */, req->PersistentFileId,
@@ -3674,6 +3759,10 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
+ trace_smb3_write_enter(xid, io_parms->persistent_fid,
+ io_parms->tcon->tid, io_parms->tcon->ses->Suid,
+ io_parms->offset, io_parms->length);
+
iov[0].iov_base = (char *)req;
/* 1 for Buffer */
iov[0].iov_len = total_len - 1;
@@ -3684,7 +3773,6 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
rc = cifs_send_recv(xid, io_parms->tcon->ses, &rqst,
&resp_buftype, flags, &rsp_iov);
- cifs_small_buf_release(req);
rsp = (struct smb2_write_rsp *)rsp_iov.iov_base;
if (rc) {
@@ -3702,6 +3790,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
io_parms->offset, *nbytes);
}
+ cifs_small_buf_release(req);
free_rsp_buf(resp_buftype, rsp);
return rc;
}
@@ -3836,6 +3925,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
+ trace_smb3_query_dir_enter(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, output_size);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base;
@@ -3843,18 +3935,26 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
if (rc == -ENODATA &&
rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+ trace_smb3_query_dir_done(xid, persistent_fid,
+ tcon->tid, tcon->ses->Suid, index, 0);
srch_inf->endOfSearch = true;
rc = 0;
- } else
+ } else {
+ trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, 0, rc);
cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
+ }
goto qdir_exit;
}
rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength), &rsp_iov,
info_buf_size);
- if (rc)
+ if (rc) {
+ trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, 0, rc);
goto qdir_exit;
+ }
srch_inf->unicode = true;
@@ -3882,6 +3982,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
else
cifs_dbg(VFS, "illegal search buffer type\n");
+ trace_smb3_query_dir_done(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, srch_inf->entries_in_buffer);
return rc;
qdir_exit:
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 538e2299805f..ee8977688e21 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -288,12 +288,12 @@ struct smb2_encryption_neg_context {
__le16 Ciphers[1]; /* Ciphers[0] since only one used now */
} __packed;
-#define POSIX_CTXT_DATA_LEN 8
+#define POSIX_CTXT_DATA_LEN 16
struct smb2_posix_neg_context {
__le16 ContextType; /* 0x100 */
__le16 DataLength;
__le32 Reserved;
- __le64 Reserved1; /* In case needed for future (eg version or caps) */
+ __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
} __packed;
struct smb2_negotiate_rsp {
@@ -959,6 +959,13 @@ struct duplicate_extents_to_file {
__le64 ByteCount; /* Bytes to be copied */
} __packed;
+/*
+ * Maximum number of iovs we need for an ioctl request.
+ * [0] : struct smb2_ioctl_req
+ * [1] : in_data
+ */
+#define SMB2_IOCTL_IOV_SIZE 2
+
struct smb2_ioctl_req {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 57 */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 87733b27a65f..52df125e9189 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -142,8 +142,13 @@ extern int SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
extern void SMB2_open_free(struct smb_rqst *rqst);
extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
- bool is_fsctl, char *in_data, u32 indatalen,
+ bool is_fsctl, char *in_data, u32 indatalen, u32 maxoutlen,
char **out_data, u32 *plen /* returned data len */);
+extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
+ u64 persistent_fid, u64 volatile_fid, u32 opcode,
+ bool is_fsctl, char *in_data, u32 indatalen,
+ __u32 max_response_size);
+extern void SMB2_ioctl_free(struct smb_rqst *rqst);
extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
@@ -223,6 +228,9 @@ extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *);
extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *,
enum securityEnum);
+extern __u8 smb2_parse_lease_state(struct TCP_Server_Info *server,
+ struct smb2_create_rsp *rsp,
+ unsigned int *epoch, char *lease_key);
extern int smb3_encryption_required(const struct cifs_tcon *tcon);
extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length,
struct kvec *iov, unsigned int min_buf_size);
diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h
index 3d5f62150de4..447c0c6e4c64 100644
--- a/fs/cifs/smb2status.h
+++ b/fs/cifs/smb2status.h
@@ -30,9 +30,9 @@
*/
#define STATUS_SEVERITY_SUCCESS __constant_cpu_to_le32(0x0000)
-#define STATUS_SEVERITY_INFORMATIONAL __constanst_cpu_to_le32(0x0001)
-#define STATUS_SEVERITY_WARNING __constanst_cpu_to_le32(0x0002)
-#define STATUS_SEVERITY_ERROR __constanst_cpu_to_le32(0x0003)
+#define STATUS_SEVERITY_INFORMATIONAL cpu_to_le32(0x0001)
+#define STATUS_SEVERITY_WARNING cpu_to_le32(0x0002)
+#define STATUS_SEVERITY_ERROR cpu_to_le32(0x0003)
struct ntstatus {
/* Facility is the high 12 bits of the following field */
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 7b351c65ee46..d1181572758b 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -576,6 +576,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
struct TCP_Server_Info *server)
{
struct mid_q_entry *temp;
+ unsigned int credits = le16_to_cpu(shdr->CreditCharge);
if (server == NULL) {
cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n");
@@ -586,6 +587,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
memset(temp, 0, sizeof(struct mid_q_entry));
kref_init(&temp->refcount);
temp->mid = le64_to_cpu(shdr->MessageId);
+ temp->credits = credits > 0 ? credits : 1;
temp->pid = current->pid;
temp->command = shdr->Command; /* Always LE */
temp->when_alloc = jiffies;
@@ -600,6 +602,8 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
atomic_inc(&midCount);
temp->mid_state = MID_REQUEST_ALLOCATED;
+ trace_smb3_cmd_enter(shdr->TreeId, shdr->SessionId,
+ le16_to_cpu(shdr->Command), temp->mid);
return temp;
}
@@ -615,6 +619,10 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr,
return -EAGAIN;
}
+ if (ses->server->tcpStatus == CifsNeedNegotiate &&
+ shdr->Command != SMB2_NEGOTIATE)
+ return -EAGAIN;
+
if (ses->status == CifsNew) {
if ((shdr->Command != SMB2_SESSION_SETUP) &&
(shdr->Command != SMB2_NEGOTIATE))
@@ -634,6 +642,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr,
spin_lock(&GlobalMid_Lock);
list_add_tail(&(*mid)->qhead, &ses->server->pending_mid_q);
spin_unlock(&GlobalMid_Lock);
+
return 0;
}
@@ -674,13 +683,18 @@ smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
smb2_seq_num_into_buf(ses->server, shdr);
rc = smb2_get_mid_entry(ses, shdr, &mid);
- if (rc)
+ if (rc) {
+ revert_current_mid_from_hdr(ses->server, shdr);
return ERR_PTR(rc);
+ }
+
rc = smb2_sign_rqst(rqst, ses->server);
if (rc) {
+ revert_current_mid_from_hdr(ses->server, shdr);
cifs_delete_mid(mid);
return ERR_PTR(rc);
}
+
return mid;
}
@@ -692,14 +706,21 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
(struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
struct mid_q_entry *mid;
+ if (server->tcpStatus == CifsNeedNegotiate &&
+ shdr->Command != SMB2_NEGOTIATE)
+ return ERR_PTR(-EAGAIN);
+
smb2_seq_num_into_buf(server, shdr);
mid = smb2_mid_entry_alloc(shdr, server);
- if (mid == NULL)
+ if (mid == NULL) {
+ revert_current_mid_from_hdr(server, shdr);
return ERR_PTR(-ENOMEM);
+ }
rc = smb2_sign_rqst(rqst, server);
if (rc) {
+ revert_current_mid_from_hdr(server, shdr);
DeleteMidQEntry(mid);
return ERR_PTR(rc);
}
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index a568dac7b3a1..b943b74cd246 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1550,7 +1550,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
char name[MAX_NAME_LEN];
int rc;
- snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
info->request_cache =
kmem_cache_create(
name,
@@ -1566,7 +1566,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (!info->request_mempool)
goto out1;
- snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
info->response_cache =
kmem_cache_create(
name,
@@ -1582,7 +1582,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (!info->response_mempool)
goto out3;
- snprintf(name, MAX_NAME_LEN, "smbd_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbd_%p", info);
info->workqueue = create_workqueue(name);
if (!info->workqueue)
goto out4;
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index 59be48206932..99c4d799c24b 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -58,6 +58,9 @@ DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
DEFINE_SMB3_RW_ERR_EVENT(write_err);
DEFINE_SMB3_RW_ERR_EVENT(read_err);
+DEFINE_SMB3_RW_ERR_EVENT(query_dir_err);
+DEFINE_SMB3_RW_ERR_EVENT(zero_err);
+DEFINE_SMB3_RW_ERR_EVENT(falloc_err);
/* For logging successful read or write */
@@ -100,8 +103,16 @@ DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
__u32 len), \
TP_ARGS(xid, fid, tid, sesid, offset, len))
+DEFINE_SMB3_RW_DONE_EVENT(write_enter);
+DEFINE_SMB3_RW_DONE_EVENT(read_enter);
+DEFINE_SMB3_RW_DONE_EVENT(query_dir_enter);
+DEFINE_SMB3_RW_DONE_EVENT(zero_enter);
+DEFINE_SMB3_RW_DONE_EVENT(falloc_enter);
DEFINE_SMB3_RW_DONE_EVENT(write_done);
DEFINE_SMB3_RW_DONE_EVENT(read_done);
+DEFINE_SMB3_RW_DONE_EVENT(query_dir_done);
+DEFINE_SMB3_RW_DONE_EVENT(zero_done);
+DEFINE_SMB3_RW_DONE_EVENT(falloc_done);
/*
* For handle based calls other than read and write, and get/set info
@@ -148,6 +159,48 @@ DEFINE_SMB3_FD_ERR_EVENT(close_err);
/*
* For handle based query/set info calls
*/
+DECLARE_EVENT_CLASS(smb3_inf_enter_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u8 infclass,
+ __u32 type),
+ TP_ARGS(xid, fid, tid, sesid, infclass, type),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u8, infclass)
+ __field(__u32, type)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->infclass = infclass;
+ __entry->type = type;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx class=%u type=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->infclass, __entry->type)
+)
+
+#define DEFINE_SMB3_INF_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_inf_enter_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u8 infclass, \
+ __u32 type), \
+ TP_ARGS(xid, fid, tid, sesid, infclass, type))
+
+DEFINE_SMB3_INF_ENTER_EVENT(query_info_enter);
+DEFINE_SMB3_INF_ENTER_EVENT(query_info_done);
+
DECLARE_EVENT_CLASS(smb3_inf_err_class,
TP_PROTO(unsigned int xid,
__u64 fid,
@@ -195,6 +248,123 @@ DEFINE_SMB3_INF_ERR_EVENT(query_info_err);
DEFINE_SMB3_INF_ERR_EVENT(set_info_err);
DEFINE_SMB3_INF_ERR_EVENT(fsctl_err);
+DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ const char *full_path),
+ TP_ARGS(xid, tid, sesid, full_path),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __string(path, full_path)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __assign_str(path, full_path);
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x path=%s",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __get_str(path))
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_inf_compound_enter_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ const char *full_path), \
+ TP_ARGS(xid, tid, sesid, full_path))
+
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
+
+
+DECLARE_EVENT_CLASS(smb3_inf_compound_done_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid),
+ TP_ARGS(xid, tid, sesid),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid)
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_inf_compound_done_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid), \
+ TP_ARGS(xid, tid, sesid))
+
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
+
+
+DECLARE_EVENT_CLASS(smb3_inf_compound_err_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ int rc),
+ TP_ARGS(xid, tid, sesid, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->rc = rc;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x rc=%d",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __entry->rc)
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_inf_compound_err_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ int rc), \
+ TP_ARGS(xid, tid, sesid, rc))
+
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
+
/*
* For logging SMB3 Status code and Command for responses which return errors
*/
@@ -270,6 +440,7 @@ DEFINE_EVENT(smb3_cmd_done_class, smb3_##name, \
__u64 mid), \
TP_ARGS(tid, sesid, cmd, mid))
+DEFINE_SMB3_CMD_DONE_EVENT(cmd_enter);
DEFINE_SMB3_CMD_DONE_EVENT(cmd_done);
DEFINE_SMB3_CMD_DONE_EVENT(ses_expired);
@@ -378,19 +549,19 @@ DECLARE_EVENT_CLASS(smb3_tcon_class,
__field(unsigned int, xid)
__field(__u32, tid)
__field(__u64, sesid)
- __field(const char *, unc_name)
+ __string(name, unc_name)
__field(int, rc)
),
TP_fast_assign(
__entry->xid = xid;
__entry->tid = tid;
__entry->sesid = sesid;
- __entry->unc_name = unc_name;
+ __assign_str(name, unc_name);
__entry->rc = rc;
),
TP_printk("xid=%u sid=0x%llx tid=0x%x unc_name=%s rc=%d",
__entry->xid, __entry->sesid, __entry->tid,
- __entry->unc_name, __entry->rc)
+ __get_str(name), __entry->rc)
)
#define DEFINE_SMB3_TCON_EVENT(name) \
@@ -406,8 +577,47 @@ DEFINE_SMB3_TCON_EVENT(tcon);
/*
- * For smb2/smb3 open call
+ * For smb2/smb3 open (including create and mkdir) calls
*/
+
+DECLARE_EVENT_CLASS(smb3_open_enter_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ int create_options,
+ int desired_access),
+ TP_ARGS(xid, tid, sesid, create_options, desired_access),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(int, create_options)
+ __field(int, desired_access)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->create_options = create_options;
+ __entry->desired_access = desired_access;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x cr_opts=0x%x des_access=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __entry->create_options, __entry->desired_access)
+)
+
+#define DEFINE_SMB3_OPEN_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_open_enter_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ int create_options, \
+ int desired_access), \
+ TP_ARGS(xid, tid, sesid, create_options, desired_access))
+
+DEFINE_SMB3_OPEN_ENTER_EVENT(open_enter);
+DEFINE_SMB3_OPEN_ENTER_EVENT(posix_mkdir_enter);
+
DECLARE_EVENT_CLASS(smb3_open_err_class,
TP_PROTO(unsigned int xid,
__u32 tid,
@@ -626,6 +836,7 @@ DEFINE_EVENT(smb3_credit_class, smb3_##name, \
TP_ARGS(currmid, hostname, credits))
DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits);
+DEFINE_SMB3_CREDIT_EVENT(credit_timeout);
#endif /* _CIFS_TRACE_H */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 53532bd3f50d..1de8e996e566 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -33,6 +33,7 @@
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/mempool.h>
+#include <linux/signal.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -291,6 +292,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
int n_vec;
unsigned int send_length = 0;
unsigned int i, j;
+ sigset_t mask, oldmask;
size_t total_len = 0, sent, size;
struct socket *ssocket = server->ssocket;
struct msghdr smb_msg;
@@ -301,8 +303,14 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
rc = smbd_send(server, rqst);
goto smbd_done;
}
+
if (ssocket == NULL)
- return -ENOTSOCK;
+ return -EAGAIN;
+
+ if (signal_pending(current)) {
+ cifs_dbg(FYI, "signal is pending before sending any data\n");
+ return -EINTR;
+ }
/* cork the socket */
kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
@@ -312,6 +320,16 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
send_length += smb_rqst_len(server, &rqst[j]);
rfc1002_marker = cpu_to_be32(send_length);
+ /*
+ * We should not allow signals to interrupt the network send because
+ * any partial send will cause session reconnects thus increasing
+ * latency of system calls and overload a server with unnecessary
+ * requests.
+ */
+
+ sigfillset(&mask);
+ sigprocmask(SIG_BLOCK, &mask, &oldmask);
+
/* Generate a rfc1002 marker for SMB2+ */
if (server->vals->header_preamble_size == 0) {
struct kvec hiov = {
@@ -321,7 +339,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
- goto uncork;
+ goto unmask;
total_len += sent;
send_length += 4;
@@ -343,7 +361,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
- goto uncork;
+ goto unmask;
total_len += sent;
@@ -365,7 +383,25 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
}
}
-uncork:
+unmask:
+ sigprocmask(SIG_SETMASK, &oldmask, NULL);
+
+ /*
+ * If signal is pending but we have already sent the whole packet to
+ * the server we need to return success status to allow a corresponding
+ * mid entry to be kept in the pending requests queue thus allowing
+ * to handle responses from the server by the client.
+ *
+ * If only part of the packet has been sent there is no need to hide
+ * interrupt because the session will be reconnected anyway, so there
+ * won't be any response from the server to handle.
+ */
+
+ if (signal_pending(current) && (total_len != send_length)) {
+ cifs_dbg(FYI, "signal is pending after attempt to send\n");
+ rc = -EINTR;
+ }
+
/* uncork it */
val = 0;
kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
@@ -450,29 +486,55 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
}
static int
-wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
- int *credits)
+wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
+ const int timeout, const int flags,
+ unsigned int *instance)
{
int rc;
+ int *credits;
+ int optype;
+ long int t;
+
+ if (timeout < 0)
+ t = MAX_JIFFY_OFFSET;
+ else
+ t = msecs_to_jiffies(timeout);
+
+ optype = flags & CIFS_OP_MASK;
+
+ *instance = 0;
+
+ credits = server->ops->get_credits_field(server, optype);
+ /* Since an echo is already inflight, no need to wait to send another */
+ if (*credits <= 0 && optype == CIFS_ECHO_OP)
+ return -EAGAIN;
spin_lock(&server->req_lock);
- if (timeout == CIFS_ASYNC_OP) {
+ if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) {
/* oplock breaks must not be held up */
server->in_flight++;
*credits -= 1;
+ *instance = server->reconnect_instance;
spin_unlock(&server->req_lock);
return 0;
}
while (1) {
- if (*credits <= 0) {
+ if (*credits < num_credits) {
spin_unlock(&server->req_lock);
cifs_num_waiters_inc(server);
- rc = wait_event_killable(server->request_q,
- has_credits(server, credits));
+ rc = wait_event_killable_timeout(server->request_q,
+ has_credits(server, credits, num_credits), t);
cifs_num_waiters_dec(server);
- if (rc)
- return rc;
+ if (!rc) {
+ trace_smb3_credit_timeout(server->CurrentMid,
+ server->hostname, num_credits);
+ cifs_dbg(VFS, "wait timed out after %d ms\n",
+ timeout);
+ return -ENOTSUPP;
+ }
+ if (rc == -ERESTARTSYS)
+ return -ERESTARTSYS;
spin_lock(&server->req_lock);
} else {
if (server->tcpStatus == CifsExiting) {
@@ -481,14 +543,53 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
}
/*
+ * For normal commands, reserve the last MAX_COMPOUND
+ * credits to compound requests.
+ * Otherwise these compounds could be permanently
+ * starved for credits by single-credit requests.
+ *
+ * To prevent spinning CPU, block this thread until
+ * there are >MAX_COMPOUND credits available.
+ * But only do this is we already have a lot of
+ * credits in flight to avoid triggering this check
+ * for servers that are slow to hand out credits on
+ * new sessions.
+ */
+ if (!optype && num_credits == 1 &&
+ server->in_flight > 2 * MAX_COMPOUND &&
+ *credits <= MAX_COMPOUND) {
+ spin_unlock(&server->req_lock);
+ cifs_num_waiters_inc(server);
+ rc = wait_event_killable_timeout(
+ server->request_q,
+ has_credits(server, credits,
+ MAX_COMPOUND + 1),
+ t);
+ cifs_num_waiters_dec(server);
+ if (!rc) {
+ trace_smb3_credit_timeout(
+ server->CurrentMid,
+ server->hostname, num_credits);
+ cifs_dbg(VFS, "wait timed out after %d ms\n",
+ timeout);
+ return -ENOTSUPP;
+ }
+ if (rc == -ERESTARTSYS)
+ return -ERESTARTSYS;
+ spin_lock(&server->req_lock);
+ continue;
+ }
+
+ /*
* Can not count locking commands against total
* as they are allowed to block on server.
*/
/* update # of requests on the wire to server */
- if (timeout != CIFS_BLOCKING_OP) {
- *credits -= 1;
- server->in_flight++;
+ if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) {
+ *credits -= num_credits;
+ server->in_flight += num_credits;
+ *instance = server->reconnect_instance;
}
spin_unlock(&server->req_lock);
break;
@@ -498,24 +599,45 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
}
static int
-wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
- const int optype)
+wait_for_free_request(struct TCP_Server_Info *server, const int flags,
+ unsigned int *instance)
{
- int *val;
+ return wait_for_free_credits(server, 1, -1, flags,
+ instance);
+}
- val = server->ops->get_credits_field(server, optype);
- /* Since an echo is already inflight, no need to wait to send another */
- if (*val <= 0 && optype == CIFS_ECHO_OP)
- return -EAGAIN;
- return wait_for_free_credits(server, timeout, val);
+static int
+wait_for_compound_request(struct TCP_Server_Info *server, int num,
+ const int flags, unsigned int *instance)
+{
+ int *credits;
+
+ credits = server->ops->get_credits_field(server, flags & CIFS_OP_MASK);
+
+ spin_lock(&server->req_lock);
+ if (*credits < num) {
+ /*
+ * Return immediately if not too many requests in flight since
+ * we will likely be stuck on waiting for credits.
+ */
+ if (server->in_flight < num - *credits) {
+ spin_unlock(&server->req_lock);
+ return -ENOTSUPP;
+ }
+ }
+ spin_unlock(&server->req_lock);
+
+ return wait_for_free_credits(server, num, 60000, flags,
+ instance);
}
int
cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
- unsigned int *num, unsigned int *credits)
+ unsigned int *num, struct cifs_credits *credits)
{
*num = size;
- *credits = 0;
+ credits->value = 0;
+ credits->instance = server->reconnect_instance;
return 0;
}
@@ -602,27 +724,43 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
int
cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
mid_receive_t *receive, mid_callback_t *callback,
- mid_handle_t *handle, void *cbdata, const int flags)
+ mid_handle_t *handle, void *cbdata, const int flags,
+ const struct cifs_credits *exist_credits)
{
- int rc, timeout, optype;
+ int rc;
struct mid_q_entry *mid;
- unsigned int credits = 0;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
+ unsigned int instance;
+ int optype;
- timeout = flags & CIFS_TIMEOUT_MASK;
optype = flags & CIFS_OP_MASK;
if ((flags & CIFS_HAS_CREDITS) == 0) {
- rc = wait_for_free_request(server, timeout, optype);
+ rc = wait_for_free_request(server, flags, &instance);
if (rc)
return rc;
- credits = 1;
- }
+ credits.value = 1;
+ credits.instance = instance;
+ } else
+ instance = exist_credits->instance;
mutex_lock(&server->srv_mutex);
+
+ /*
+ * We can't use credits obtained from the previous session to send this
+ * request. Check if there were reconnects after we obtained credits and
+ * return -EAGAIN in such cases to let callers handle it.
+ */
+ if (instance != server->reconnect_instance) {
+ mutex_unlock(&server->srv_mutex);
+ add_credits_and_wake_if(server, &credits, optype);
+ return -EAGAIN;
+ }
+
mid = server->ops->setup_async_request(server, rqst);
if (IS_ERR(mid)) {
mutex_unlock(&server->srv_mutex);
- add_credits_and_wake_if(server, credits, optype);
+ add_credits_and_wake_if(server, &credits, optype);
return PTR_ERR(mid);
}
@@ -647,6 +785,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
cifs_in_send_dec(server);
if (rc < 0) {
+ revert_current_mid(server, mid->credits);
server->sequence_number -= 2;
cifs_delete_mid(mid);
}
@@ -656,7 +795,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
if (rc == 0)
return 0;
- add_credits_and_wake_if(server, credits, optype);
+ add_credits_and_wake_if(server, &credits, optype);
return rc;
}
@@ -786,8 +925,12 @@ static void
cifs_compound_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->server;
+ struct cifs_credits credits;
+
+ credits.value = server->ops->get_credits(mid);
+ credits.instance = server->reconnect_instance;
- add_credits(server, server->ops->get_credits(mid), mid->optype);
+ add_credits(server, &credits, mid->optype);
}
static void
@@ -809,14 +952,15 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
const int flags, const int num_rqst, struct smb_rqst *rqst,
int *resp_buf_type, struct kvec *resp_iov)
{
- int i, j, rc = 0;
- int timeout, optype;
+ int i, j, optype, rc = 0;
struct mid_q_entry *midQ[MAX_COMPOUND];
bool cancelled_mid[MAX_COMPOUND] = {false};
- unsigned int credits[MAX_COMPOUND] = {0};
+ struct cifs_credits credits[MAX_COMPOUND] = {
+ { .value = 0, .instance = 0 }
+ };
+ unsigned int instance;
char *buf;
- timeout = flags & CIFS_TIMEOUT_MASK;
optype = flags & CIFS_OP_MASK;
for (i = 0; i < num_rqst; i++)
@@ -831,30 +975,21 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
return -ENOENT;
/*
- * Ensure we obtain 1 credit per request in the compound chain.
- * It can be optimized further by waiting for all the credits
- * at once but this can wait long enough if we don't have enough
- * credits due to some heavy operations in progress or the server
- * not granting us much, so a fallback to the current approach is
- * needed anyway.
+ * Wait for all the requests to become available.
+ * This approach still leaves the possibility to be stuck waiting for
+ * credits if the server doesn't grant credits to the outstanding
+ * requests and if the client is completely idle, not generating any
+ * other requests.
+ * This can be handled by the eventual session reconnect.
*/
+ rc = wait_for_compound_request(ses->server, num_rqst, flags,
+ &instance);
+ if (rc)
+ return rc;
+
for (i = 0; i < num_rqst; i++) {
- rc = wait_for_free_request(ses->server, timeout, optype);
- if (rc) {
- /*
- * We haven't sent an SMB packet to the server yet but
- * we already obtained credits for i requests in the
- * compound chain - need to return those credits back
- * for future use. Note that we need to call add_credits
- * multiple times to match the way we obtained credits
- * in the first place and to account for in flight
- * requests correctly.
- */
- for (j = 0; j < i; j++)
- add_credits(ses->server, 1, optype);
- return rc;
- }
- credits[i] = 1;
+ credits[i].value = 1;
+ credits[i].instance = instance;
}
/*
@@ -865,16 +1000,31 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
mutex_lock(&ses->server->srv_mutex);
+ /*
+ * All the parts of the compound chain belong obtained credits from the
+ * same session. We can not use credits obtained from the previous
+ * session to send this request. Check if there were reconnects after
+ * we obtained credits and return -EAGAIN in such cases to let callers
+ * handle it.
+ */
+ if (instance != ses->server->reconnect_instance) {
+ mutex_unlock(&ses->server->srv_mutex);
+ for (j = 0; j < num_rqst; j++)
+ add_credits(ses->server, &credits[j], optype);
+ return -EAGAIN;
+ }
+
for (i = 0; i < num_rqst; i++) {
midQ[i] = ses->server->ops->setup_request(ses, &rqst[i]);
if (IS_ERR(midQ[i])) {
+ revert_current_mid(ses->server, i);
for (j = 0; j < i; j++)
cifs_delete_mid(midQ[j]);
mutex_unlock(&ses->server->srv_mutex);
/* Update # of requests on wire to server */
for (j = 0; j < num_rqst; j++)
- add_credits(ses->server, credits[j], optype);
+ add_credits(ses->server, &credits[j], optype);
return PTR_ERR(midQ[i]);
}
@@ -897,15 +1047,17 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
for (i = 0; i < num_rqst; i++)
cifs_save_when_sent(midQ[i]);
- if (rc < 0)
+ if (rc < 0) {
+ revert_current_mid(ses->server, num_rqst);
ses->server->sequence_number -= 2;
+ }
mutex_unlock(&ses->server->srv_mutex);
if (rc < 0) {
/* Sending failed for some reason - return credits back */
for (i = 0; i < num_rqst; i++)
- add_credits(ses->server, credits[i], optype);
+ add_credits(ses->server, &credits[i], optype);
goto out;
}
@@ -924,7 +1076,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
smb311_update_preauth_hash(ses, rqst[0].rq_iov,
rqst[0].rq_nvec);
- if (timeout == CIFS_ASYNC_OP)
+ if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP)
goto out;
for (i = 0; i < num_rqst; i++) {
@@ -942,7 +1094,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
midQ[i]->callback = cifs_cancelled_callback;
cancelled_mid[i] = true;
- credits[i] = 0;
+ credits[i].value = 0;
}
spin_unlock(&GlobalMid_Lock);
}
@@ -1061,13 +1213,14 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
int
SendReceive(const unsigned int xid, struct cifs_ses *ses,
struct smb_hdr *in_buf, struct smb_hdr *out_buf,
- int *pbytes_returned, const int timeout)
+ int *pbytes_returned, const int flags)
{
int rc = 0;
struct mid_q_entry *midQ;
unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
struct kvec iov = { .iov_base = in_buf, .iov_len = len };
struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
if (ses == NULL) {
cifs_dbg(VFS, "Null smb session\n");
@@ -1091,7 +1244,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- rc = wait_for_free_request(ses->server, timeout, 0);
+ rc = wait_for_free_request(ses->server, flags, &credits.instance);
if (rc)
return rc;
@@ -1105,7 +1258,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc) {
mutex_unlock(&ses->server->srv_mutex);
/* Update # of requests on wire to server */
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
@@ -1130,7 +1283,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc < 0)
goto out;
- if (timeout == CIFS_ASYNC_OP)
+ if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP)
goto out;
rc = wait_for_response(ses->server, midQ);
@@ -1141,7 +1294,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
/* no longer considered to be "in-flight" */
midQ->callback = DeleteMidQEntry;
spin_unlock(&GlobalMid_Lock);
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
spin_unlock(&GlobalMid_Lock);
@@ -1149,7 +1302,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = cifs_sync_mid_result(midQ, ses->server);
if (rc != 0) {
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
@@ -1165,7 +1318,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = cifs_check_receive(midQ, ses->server, 0);
out:
cifs_delete_mid(midQ);
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
@@ -1207,6 +1360,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
struct kvec iov = { .iov_base = in_buf, .iov_len = len };
struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
+ unsigned int instance;
if (tcon == NULL || tcon->ses == NULL) {
cifs_dbg(VFS, "Null smb session\n");
@@ -1232,7 +1386,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
return -EIO;
}
- rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0);
+ rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, &instance);
if (rc)
return rc;
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 02b7d91c9231..f0de238000c0 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -1,16 +1,16 @@
config FS_ENCRYPTION
- tristate "FS Encryption (Per-file encryption)"
+ bool "FS Encryption (Per-file encryption)"
select CRYPTO
select CRYPTO_AES
select CRYPTO_CBC
select CRYPTO_ECB
select CRYPTO_XTS
select CRYPTO_CTS
- select CRYPTO_CTR
select CRYPTO_SHA256
select KEYS
help
Enable encryption of files and directories. This
feature is similar to ecryptfs, but it is more memory
efficient since it avoids caching the encrypted and
- decrypted pages in the page cache.
+ decrypted pages in the page cache. Currently Ext4,
+ F2FS and UBIFS make use of this feature.
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 7424f851eb5c..7da276159593 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -12,7 +12,6 @@
#ifndef _FSCRYPT_PRIVATE_H
#define _FSCRYPT_PRIVATE_H
-#define __FS_HAS_ENCRYPTION 1
#include <linux/fscrypt.h>
#include <crypto/hash.h>
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 926e5df20ec3..56debb1fcf5e 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -58,7 +58,7 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir)
return err;
if (!fscrypt_has_permitted_context(dir, inode))
- return -EPERM;
+ return -EXDEV;
return 0;
}
@@ -82,13 +82,13 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
if (IS_ENCRYPTED(new_dir) &&
!fscrypt_has_permitted_context(new_dir,
d_inode(old_dentry)))
- return -EPERM;
+ return -EXDEV;
if ((flags & RENAME_EXCHANGE) &&
IS_ENCRYPTED(old_dir) &&
!fscrypt_has_permitted_context(old_dir,
d_inode(new_dentry)))
- return -EPERM;
+ return -EXDEV;
}
return 0;
}
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 322ce9686bdb..2cb4956f8511 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -402,7 +402,6 @@ static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt)
{
SHASH_DESC_ON_STACK(desc, tfm);
desc->tfm = tfm;
- desc->flags = 0;
return crypto_shash_digest(desc, key, keysize, salt);
}
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index f490de921ce8..bd7eaf9b3f00 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -151,8 +151,7 @@ EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
* malicious offline violations of this constraint, while the link and rename
* checks are needed to prevent online violations of this constraint.
*
- * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail
- * the filesystem operation with EPERM.
+ * Return: 1 if permitted, 0 if forbidden.
*/
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
diff --git a/fs/dax.c b/fs/dax.c
index 6959837cc465..e5e54da1715f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -33,6 +33,7 @@
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
+#include <asm/pgalloc.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -788,7 +789,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
address = pgoff_address(index, vma);
/*
- * Note because we provide start/end to follow_pte_pmd it will
+ * Note because we provide range to follow_pte_pmd it will
* call mmu_notifier_invalidate_range_start() on our behalf
* before taking any lock.
*/
@@ -843,9 +844,8 @@ unlock_pte:
static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
struct address_space *mapping, void *entry)
{
- unsigned long pfn;
+ unsigned long pfn, index, count;
long ret = 0;
- size_t size;
/*
* A page got tagged dirty in DAX mapping? Something is seriously
@@ -894,17 +894,18 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
xas_unlock_irq(xas);
/*
- * Even if dax_writeback_mapping_range() was given a wbc->range_start
- * in the middle of a PMD, the 'index' we are given will be aligned to
- * the start index of the PMD, as will the pfn we pull from 'entry'.
+ * If dax_writeback_mapping_range() was given a wbc->range_start
+ * in the middle of a PMD, the 'index' we use needs to be
+ * aligned to the start of the PMD.
* This allows us to flush for PMD_SIZE and not have to worry about
* partial PMD writebacks.
*/
pfn = dax_to_pfn(entry);
- size = PAGE_SIZE << dax_entry_order(entry);
+ count = 1UL << dax_entry_order(entry);
+ index = xas->xa_index & ~(count - 1);
- dax_entry_mkclean(mapping, xas->xa_index, pfn);
- dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
+ dax_entry_mkclean(mapping, index, pfn);
+ dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
/*
* After we have flushed the cache, we can clear the dirty tag. There
* cannot be new dirty data in the pfn after the flush has completed as
@@ -917,8 +918,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
dax_wake_entry(xas, entry, false);
- trace_dax_writeback_one(mapping->host, xas->xa_index,
- size >> PAGE_SHIFT);
+ trace_dax_writeback_one(mapping->host, index, count);
return ret;
put_unlocked:
@@ -1220,9 +1220,7 @@ static vm_fault_t dax_fault_return(int error)
{
if (error == 0)
return VM_FAULT_NOPAGE;
- if (error == -ENOMEM)
- return VM_FAULT_OOM;
- return VM_FAULT_SIGBUS;
+ return vmf_error(error);
}
/*
@@ -1410,7 +1408,9 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
struct inode *inode = mapping->host;
+ pgtable_t pgtable = NULL;
struct page *zero_page;
spinlock_t *ptl;
pmd_t pmd_entry;
@@ -1425,12 +1425,22 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
DAX_PMD | DAX_ZERO_PAGE, false);
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
spin_unlock(ptl);
goto fallback;
}
+ if (pgtable) {
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+ mm_inc_nr_ptes(vma->vm_mm);
+ }
pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
pmd_entry = pmd_mkhuge(pmd_entry);
set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
@@ -1439,6 +1449,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
return VM_FAULT_NOPAGE;
fallback:
+ if (pgtable)
+ pte_free(vma->vm_mm, pgtable);
trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
return VM_FAULT_FALLBACK;
}
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 95b5e78c22b1..f25daa207421 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -163,19 +163,24 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
-static void debugfs_evict_inode(struct inode *inode)
+static void debugfs_i_callback(struct rcu_head *head)
{
- truncate_inode_pages_final(&inode->i_data);
- clear_inode(inode);
+ struct inode *inode = container_of(head, struct inode, i_rcu);
if (S_ISLNK(inode->i_mode))
kfree(inode->i_link);
+ free_inode_nonrcu(inode);
+}
+
+static void debugfs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, debugfs_i_callback);
}
static const struct super_operations debugfs_super_operations = {
.statfs = simple_statfs,
.remount_fs = debugfs_remount,
.show_options = debugfs_show_options,
- .evict_inode = debugfs_evict_inode,
+ .destroy_inode = debugfs_destroy_inode,
};
static void debugfs_release_dentry(struct dentry *dentry)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c53814539070..553a3f3300ae 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -455,6 +455,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_blocksize_bits = 10;
s->s_magic = DEVPTS_SUPER_MAGIC;
s->s_op = &devpts_sops;
+ s->s_d_op = &simple_dentry_operations;
s->s_time_gran = 1;
error = -ENOMEM;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index f664da55234e..491cf5baa8c2 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -68,7 +68,6 @@ static int ecryptfs_hash_digest(struct crypto_shash *tfm,
int err;
desc->tfm = tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
err = crypto_shash_digest(desc, src, len, dst);
shash_desc_zero(desc);
return err;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e74fe84d0886..90fbac5d485b 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -769,7 +769,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
}
s->hash_desc->tfm = s->hash_tfm;
- s->hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
rc = crypto_shash_digest(s->hash_desc,
(u8 *)s->auth_tok->token.password.session_key_encryption_key,
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
deleted file mode 100644
index 1b2d4c63a579..000000000000
--- a/fs/exofs/BUGS
+++ /dev/null
@@ -1,3 +0,0 @@
-- Out-of-space may cause a severe problem if the object (and directory entry)
- were written, but the inode attributes failed. Then if the filesystem was
- unmounted and mounted the kernel can get into an endless loop doing a readdir.
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
deleted file mode 100644
index a364fd0965ec..000000000000
--- a/fs/exofs/Kbuild
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# Kbuild for the EXOFS module
-#
-# Copyright (C) 2008 Panasas Inc. All rights reserved.
-#
-# Authors:
-# Boaz Harrosh <ooo@electrozaur.com>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2
-#
-# Kbuild - Gets included from the Kernels Makefile and build system
-#
-
-# ore module library
-libore-y := ore.o ore_raid.o
-obj-$(CONFIG_ORE) += libore.o
-
-exofs-y := inode.o file.o namei.o dir.o super.o sys.o
-obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
deleted file mode 100644
index 86194b2f799d..000000000000
--- a/fs/exofs/Kconfig
+++ /dev/null
@@ -1,13 +0,0 @@
-config EXOFS_FS
- tristate "exofs: OSD based file system support"
- depends on SCSI_OSD_ULD
- help
- EXOFS is a file system that uses an OSD storage device,
- as its backing storage.
-
-# Debugging-related stuff
-config EXOFS_DEBUG
- bool "Enable debugging"
- depends on EXOFS_FS
- help
- This option enables EXOFS debug prints.
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore
deleted file mode 100644
index 2daf2329c28d..000000000000
--- a/fs/exofs/Kconfig.ore
+++ /dev/null
@@ -1,14 +0,0 @@
-# ORE - Objects Raid Engine (libore.ko)
-#
-# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
-# for every ORE user we do it like this. Any user should add itself here
-# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
-# selected here, and we default to "ON". So in effect it is like been
-# selected by any of the users.
-config ORE
- tristate
- depends on EXOFS_FS || PNFS_OBJLAYOUT
- select ASYNC_XOR
- select RAID6_PQ
- select ASYNC_PQ
- default SCSI_OSD_ULD
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
deleted file mode 100644
index 7d88ef566213..000000000000
--- a/fs/exofs/common.h
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * common.h - Common definitions for both Kernel and user-mode utilities
- *
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef __EXOFS_COM_H__
-#define __EXOFS_COM_H__
-
-#include <linux/types.h>
-
-#include <scsi/osd_attributes.h>
-#include <scsi/osd_initiator.h>
-#include <scsi/osd_sec.h>
-
-/****************************************************************************
- * Object ID related defines
- * NOTE: inode# = object ID - EXOFS_OBJ_OFF
- ****************************************************************************/
-#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
-#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
-#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
-#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
-#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
-
-/* exofs Application specific page/attribute */
-/* Inode attrs */
-# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
-# define EXOFS_ATTR_INODE_DATA 1
-# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
-# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
-/* Partition attrs */
-# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3)
-# define EXOFS_ATTR_SB_STATS 1
-
-/*
- * The maximum number of files we can have is limited by the size of the
- * inode number. This is the largest object ID that the file system supports.
- * Object IDs 0, 1, and 2 are always in use (see above defines).
- */
-enum {
- EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
- (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
- EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
-};
-
-/****************************************************************************
- * Misc.
- ****************************************************************************/
-#define EXOFS_BLKSHIFT 12
-#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT)
-
-/****************************************************************************
- * superblock-related things
- ****************************************************************************/
-#define EXOFS_SUPER_MAGIC 0x5DF5
-
-/*
- * The file system control block - stored in object EXOFS_SUPER_ID's data.
- * This is where the in-memory superblock is stored on disk.
- */
-enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
-struct exofs_fscb {
- __le64 s_nextid; /* Only used after mkfs */
- __le64 s_numfiles; /* Only used after mkfs */
- __le32 s_version; /* == EXOFS_FSCB_VER */
- __le16 s_magic; /* Magic signature */
- __le16 s_newfs; /* Non-zero if this is a new fs */
-
- /* From here on it's a static part, only written by mkexofs */
- __le64 s_dev_table_oid; /* Resurved, not used */
- __le64 s_dev_table_count; /* == 0 means no dev_table */
-} __packed;
-
-/*
- * This struct is set on the FS partition's attributes.
- * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
- * with the create command, to atomically persist the sb writeable information.
- */
-struct exofs_sb_stats {
- __le64 s_nextid; /* Highest object ID used */
- __le64 s_numfiles; /* Number of files on fs */
-} __packed;
-
-/*
- * Describes the raid used in the FS. It is part of the device table.
- * This here is taken from the pNFS-objects definition. In exofs we
- * use one raid policy through-out the filesystem. (NOTE: the funny
- * alignment at beginning. We take care of it at exofs_device_table.
- */
-struct exofs_dt_data_map {
- __le32 cb_num_comps;
- __le64 cb_stripe_unit;
- __le32 cb_group_width;
- __le32 cb_group_depth;
- __le32 cb_mirror_cnt;
- __le32 cb_raid_algorithm;
-} __packed;
-
-/*
- * This is an osd device information descriptor. It is a single entry in
- * the exofs device table. It describes an osd target lun which
- * contains data belonging to this FS. (Same partition_id on all devices)
- */
-struct exofs_dt_device_info {
- __le32 systemid_len;
- u8 systemid[OSD_SYSTEMID_LEN];
- __le64 long_name_offset; /* If !0 then offset-in-file */
- __le32 osdname_len; /* */
- u8 osdname[44]; /* Embbeded, Usually an asci uuid */
-} __packed;
-
-/*
- * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
- * It contains the raid used for this multy-device FS and an array of
- * participating devices.
- */
-struct exofs_device_table {
- __le32 dt_version; /* == EXOFS_DT_VER */
- struct exofs_dt_data_map dt_data_map; /* Raid policy to use */
-
- /* Resurved space For future use. Total includeing this:
- * (8 * sizeof(le64))
- */
- __le64 __Resurved[4];
-
- __le64 dt_num_devices; /* Array size */
- struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */
-} __packed;
-
-/****************************************************************************
- * inode-related things
- ****************************************************************************/
-#define EXOFS_IDATA 5
-
-/*
- * The file control block - stored in an object's attributes. This is where
- * the in-memory inode is stored on disk.
- */
-struct exofs_fcb {
- __le64 i_size; /* Size of the file */
- __le16 i_mode; /* File mode */
- __le16 i_links_count; /* Links count */
- __le32 i_uid; /* Owner Uid */
- __le32 i_gid; /* Group Id */
- __le32 i_atime; /* Access time */
- __le32 i_ctime; /* Creation time */
- __le32 i_mtime; /* Modification time */
- __le32 i_flags; /* File flags (unused for now)*/
- __le32 i_generation; /* File version (for NFS) */
- __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */
-};
-
-#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb)
-
-/* This is the Attribute the fcb is stored in */
-static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
- EXOFS_APAGE_FS_DATA,
- EXOFS_ATTR_INODE_DATA,
- EXOFS_INO_ATTR_SIZE);
-
-/****************************************************************************
- * dentry-related things
- ****************************************************************************/
-#define EXOFS_NAME_LEN 255
-
-/*
- * The on-disk directory entry
- */
-struct exofs_dir_entry {
- __le64 inode_no; /* inode number */
- __le16 rec_len; /* directory entry length */
- u8 name_len; /* name length */
- u8 file_type; /* umm...file type */
- char name[EXOFS_NAME_LEN]; /* file name */
-};
-
-enum {
- EXOFS_FT_UNKNOWN,
- EXOFS_FT_REG_FILE,
- EXOFS_FT_DIR,
- EXOFS_FT_CHRDEV,
- EXOFS_FT_BLKDEV,
- EXOFS_FT_FIFO,
- EXOFS_FT_SOCK,
- EXOFS_FT_SYMLINK,
- EXOFS_FT_MAX
-};
-
-#define EXOFS_DIR_PAD 4
-#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1)
-#define EXOFS_DIR_REC_LEN(name_len) \
- (((name_len) + offsetof(struct exofs_dir_entry, name) + \
- EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
-
-/*
- * The on-disk (optional) layout structure.
- * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
- * attribute, attached to any inode, usually to a directory.
- */
-
-enum exofs_inode_layout_gen_functions {
- LAYOUT_MOVING_WINDOW = 0,
- LAYOUT_IMPLICT = 1,
-};
-
-struct exofs_on_disk_inode_layout {
- __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */
- __le16 pad;
- union {
- /* gen_func == LAYOUT_MOVING_WINDOW (default) */
- struct exofs_layout_sliding_window {
- __le32 num_devices; /* first n devices in global-table*/
- } sliding_window __packed;
-
- /* gen_func == LAYOUT_IMPLICT */
- struct exofs_layout_implict_list {
- struct exofs_dt_data_map data_map;
- /* Variable array of size data_map.cb_num_comps. These
- * are device indexes of the devices in the global table
- */
- __le32 dev_indexes[];
- } implict __packed;
- };
-} __packed;
-
-static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs)
-{
- return sizeof(struct exofs_on_disk_inode_layout) +
- max_devs * sizeof(__le32);
-}
-
-#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
deleted file mode 100644
index f0138674c1ed..000000000000
--- a/fs/exofs/dir.c
+++ /dev/null
@@ -1,661 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/iversion.h>
-#include "exofs.h"
-
-static inline unsigned exofs_chunk_size(struct inode *inode)
-{
- return inode->i_sb->s_blocksize;
-}
-
-static inline void exofs_put_page(struct page *page)
-{
- kunmap(page);
- put_page(page);
-}
-
-static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
-{
- loff_t last_byte = inode->i_size;
-
- last_byte -= page_nr << PAGE_SHIFT;
- if (last_byte > PAGE_SIZE)
- last_byte = PAGE_SIZE;
- return last_byte;
-}
-
-static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
-{
- struct address_space *mapping = page->mapping;
- struct inode *dir = mapping->host;
- int err = 0;
-
- inode_inc_iversion(dir);
-
- if (!PageUptodate(page))
- SetPageUptodate(page);
-
- if (pos+len > dir->i_size) {
- i_size_write(dir, pos+len);
- mark_inode_dirty(dir);
- }
- set_page_dirty(page);
-
- if (IS_DIRSYNC(dir))
- err = write_one_page(page);
- else
- unlock_page(page);
-
- return err;
-}
-
-static bool exofs_check_page(struct page *page)
-{
- struct inode *dir = page->mapping->host;
- unsigned chunk_size = exofs_chunk_size(dir);
- char *kaddr = page_address(page);
- unsigned offs, rec_len;
- unsigned limit = PAGE_SIZE;
- struct exofs_dir_entry *p;
- char *error;
-
- /* if the page is the last one in the directory */
- if ((dir->i_size >> PAGE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_MASK;
- if (limit & (chunk_size - 1))
- goto Ebadsize;
- if (!limit)
- goto out;
- }
- for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
- p = (struct exofs_dir_entry *)(kaddr + offs);
- rec_len = le16_to_cpu(p->rec_len);
-
- if (rec_len < EXOFS_DIR_REC_LEN(1))
- goto Eshort;
- if (rec_len & 3)
- goto Ealign;
- if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
- goto Enamelen;
- if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
- goto Espan;
- }
- if (offs != limit)
- goto Eend;
-out:
- SetPageChecked(page);
- return true;
-
-Ebadsize:
- EXOFS_ERR("ERROR [exofs_check_page]: "
- "size of directory(0x%lx) is not a multiple of chunk size\n",
- dir->i_ino
- );
- goto fail;
-Eshort:
- error = "rec_len is smaller than minimal";
- goto bad_entry;
-Ealign:
- error = "unaligned directory entry";
- goto bad_entry;
-Enamelen:
- error = "rec_len is too small for name_len";
- goto bad_entry;
-Espan:
- error = "directory entry across blocks";
- goto bad_entry;
-bad_entry:
- EXOFS_ERR(
- "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
- "offset=%lu, inode=0x%llx, rec_len=%d, name_len=%d\n",
- dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
- _LLU(le64_to_cpu(p->inode_no)),
- rec_len, p->name_len);
- goto fail;
-Eend:
- p = (struct exofs_dir_entry *)(kaddr + offs);
- EXOFS_ERR("ERROR [exofs_check_page]: "
- "entry in directory(0x%lx) spans the page boundary"
- "offset=%lu, inode=0x%llx\n",
- dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
- _LLU(le64_to_cpu(p->inode_no)));
-fail:
- SetPageError(page);
- return false;
-}
-
-static struct page *exofs_get_page(struct inode *dir, unsigned long n)
-{
- struct address_space *mapping = dir->i_mapping;
- struct page *page = read_mapping_page(mapping, n, NULL);
-
- if (!IS_ERR(page)) {
- kmap(page);
- if (unlikely(!PageChecked(page))) {
- if (PageError(page) || !exofs_check_page(page))
- goto fail;
- }
- }
- return page;
-
-fail:
- exofs_put_page(page);
- return ERR_PTR(-EIO);
-}
-
-static inline int exofs_match(int len, const unsigned char *name,
- struct exofs_dir_entry *de)
-{
- if (len != de->name_len)
- return 0;
- if (!de->inode_no)
- return 0;
- return !memcmp(name, de->name, len);
-}
-
-static inline
-struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
-{
- return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
-}
-
-static inline unsigned
-exofs_validate_entry(char *base, unsigned offset, unsigned mask)
-{
- struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
- struct exofs_dir_entry *p =
- (struct exofs_dir_entry *)(base + (offset&mask));
- while ((char *)p < (char *)de) {
- if (p->rec_len == 0)
- break;
- p = exofs_next_entry(p);
- }
- return (char *)p - base;
-}
-
-static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
- [EXOFS_FT_UNKNOWN] = DT_UNKNOWN,
- [EXOFS_FT_REG_FILE] = DT_REG,
- [EXOFS_FT_DIR] = DT_DIR,
- [EXOFS_FT_CHRDEV] = DT_CHR,
- [EXOFS_FT_BLKDEV] = DT_BLK,
- [EXOFS_FT_FIFO] = DT_FIFO,
- [EXOFS_FT_SOCK] = DT_SOCK,
- [EXOFS_FT_SYMLINK] = DT_LNK,
-};
-
-#define S_SHIFT 12
-static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR,
- [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK,
-};
-
-static inline
-void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
-{
- umode_t mode = inode->i_mode;
- de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
-}
-
-static int
-exofs_readdir(struct file *file, struct dir_context *ctx)
-{
- loff_t pos = ctx->pos;
- struct inode *inode = file_inode(file);
- unsigned int offset = pos & ~PAGE_MASK;
- unsigned long n = pos >> PAGE_SHIFT;
- unsigned long npages = dir_pages(inode);
- unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
- bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
-
- if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
- return 0;
-
- for ( ; n < npages; n++, offset = 0) {
- char *kaddr, *limit;
- struct exofs_dir_entry *de;
- struct page *page = exofs_get_page(inode, n);
-
- if (IS_ERR(page)) {
- EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
- inode->i_ino);
- ctx->pos += PAGE_SIZE - offset;
- return PTR_ERR(page);
- }
- kaddr = page_address(page);
- if (unlikely(need_revalidate)) {
- if (offset) {
- offset = exofs_validate_entry(kaddr, offset,
- chunk_mask);
- ctx->pos = (n<<PAGE_SHIFT) + offset;
- }
- file->f_version = inode_query_iversion(inode);
- need_revalidate = false;
- }
- de = (struct exofs_dir_entry *)(kaddr + offset);
- limit = kaddr + exofs_last_byte(inode, n) -
- EXOFS_DIR_REC_LEN(1);
- for (; (char *)de <= limit; de = exofs_next_entry(de)) {
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: "
- "zero-length entry in directory(0x%lx)\n",
- inode->i_ino);
- exofs_put_page(page);
- return -EIO;
- }
- if (de->inode_no) {
- unsigned char t;
-
- if (de->file_type < EXOFS_FT_MAX)
- t = exofs_filetype_table[de->file_type];
- else
- t = DT_UNKNOWN;
-
- if (!dir_emit(ctx, de->name, de->name_len,
- le64_to_cpu(de->inode_no),
- t)) {
- exofs_put_page(page);
- return 0;
- }
- }
- ctx->pos += le16_to_cpu(de->rec_len);
- }
- exofs_put_page(page);
- }
- return 0;
-}
-
-struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
- struct dentry *dentry, struct page **res_page)
-{
- const unsigned char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
- unsigned long start, n;
- unsigned long npages = dir_pages(dir);
- struct page *page = NULL;
- struct exofs_i_info *oi = exofs_i(dir);
- struct exofs_dir_entry *de;
-
- if (npages == 0)
- goto out;
-
- *res_page = NULL;
-
- start = oi->i_dir_start_lookup;
- if (start >= npages)
- start = 0;
- n = start;
- do {
- char *kaddr;
- page = exofs_get_page(dir, n);
- if (!IS_ERR(page)) {
- kaddr = page_address(page);
- de = (struct exofs_dir_entry *) kaddr;
- kaddr += exofs_last_byte(dir, n) - reclen;
- while ((char *) de <= kaddr) {
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: zero-length entry in "
- "directory(0x%lx)\n",
- dir->i_ino);
- exofs_put_page(page);
- goto out;
- }
- if (exofs_match(namelen, name, de))
- goto found;
- de = exofs_next_entry(de);
- }
- exofs_put_page(page);
- }
- if (++n >= npages)
- n = 0;
- } while (n != start);
-out:
- return NULL;
-
-found:
- *res_page = page;
- oi->i_dir_start_lookup = n;
- return de;
-}
-
-struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
-{
- struct page *page = exofs_get_page(dir, 0);
- struct exofs_dir_entry *de = NULL;
-
- if (!IS_ERR(page)) {
- de = exofs_next_entry(
- (struct exofs_dir_entry *)page_address(page));
- *p = page;
- }
- return de;
-}
-
-ino_t exofs_parent_ino(struct dentry *child)
-{
- struct page *page;
- struct exofs_dir_entry *de;
- ino_t ino;
-
- de = exofs_dotdot(d_inode(child), &page);
- if (!de)
- return 0;
-
- ino = le64_to_cpu(de->inode_no);
- exofs_put_page(page);
- return ino;
-}
-
-ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
-{
- ino_t res = 0;
- struct exofs_dir_entry *de;
- struct page *page;
-
- de = exofs_find_entry(dir, dentry, &page);
- if (de) {
- res = le64_to_cpu(de->inode_no);
- exofs_put_page(page);
- }
- return res;
-}
-
-int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
- struct page *page, struct inode *inode)
-{
- loff_t pos = page_offset(page) +
- (char *) de - (char *) page_address(page);
- unsigned len = le16_to_cpu(de->rec_len);
- int err;
-
- lock_page(page);
- err = exofs_write_begin(NULL, page->mapping, pos, len, 0, &page, NULL);
- if (err)
- EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
- err);
-
- de->inode_no = cpu_to_le64(inode->i_ino);
- exofs_set_de_type(de, inode);
- if (likely(!err))
- err = exofs_commit_chunk(page, pos, len);
- exofs_put_page(page);
- dir->i_mtime = dir->i_ctime = current_time(dir);
- mark_inode_dirty(dir);
- return err;
-}
-
-int exofs_add_link(struct dentry *dentry, struct inode *inode)
-{
- struct inode *dir = d_inode(dentry->d_parent);
- const unsigned char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- unsigned chunk_size = exofs_chunk_size(dir);
- unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
- unsigned short rec_len, name_len;
- struct page *page = NULL;
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
- struct exofs_dir_entry *de;
- unsigned long npages = dir_pages(dir);
- unsigned long n;
- char *kaddr;
- loff_t pos;
- int err;
-
- for (n = 0; n <= npages; n++) {
- char *dir_end;
-
- page = exofs_get_page(dir, n);
- err = PTR_ERR(page);
- if (IS_ERR(page))
- goto out;
- lock_page(page);
- kaddr = page_address(page);
- dir_end = kaddr + exofs_last_byte(dir, n);
- de = (struct exofs_dir_entry *)kaddr;
- kaddr += PAGE_SIZE - reclen;
- while ((char *)de <= kaddr) {
- if ((char *)de == dir_end) {
- name_len = 0;
- rec_len = chunk_size;
- de->rec_len = cpu_to_le16(chunk_size);
- de->inode_no = 0;
- goto got_it;
- }
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: exofs_add_link: "
- "zero-length entry in directory(0x%lx)\n",
- inode->i_ino);
- err = -EIO;
- goto out_unlock;
- }
- err = -EEXIST;
- if (exofs_match(namelen, name, de))
- goto out_unlock;
- name_len = EXOFS_DIR_REC_LEN(de->name_len);
- rec_len = le16_to_cpu(de->rec_len);
- if (!de->inode_no && rec_len >= reclen)
- goto got_it;
- if (rec_len >= name_len + reclen)
- goto got_it;
- de = (struct exofs_dir_entry *) ((char *) de + rec_len);
- }
- unlock_page(page);
- exofs_put_page(page);
- }
-
- EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n",
- dentry, inode->i_ino);
- return -EINVAL;
-
-got_it:
- pos = page_offset(page) +
- (char *)de - (char *)page_address(page);
- err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
- &page, NULL);
- if (err)
- goto out_unlock;
- if (de->inode_no) {
- struct exofs_dir_entry *de1 =
- (struct exofs_dir_entry *)((char *)de + name_len);
- de1->rec_len = cpu_to_le16(rec_len - name_len);
- de->rec_len = cpu_to_le16(name_len);
- de = de1;
- }
- de->name_len = namelen;
- memcpy(de->name, name, namelen);
- de->inode_no = cpu_to_le64(inode->i_ino);
- exofs_set_de_type(de, inode);
- err = exofs_commit_chunk(page, pos, rec_len);
- dir->i_mtime = dir->i_ctime = current_time(dir);
- mark_inode_dirty(dir);
- sbi->s_numfiles++;
-
-out_put:
- exofs_put_page(page);
-out:
- return err;
-out_unlock:
- unlock_page(page);
- goto out_put;
-}
-
-int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
-{
- struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
- char *kaddr = page_address(page);
- unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
- unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
- loff_t pos;
- struct exofs_dir_entry *pde = NULL;
- struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
- int err;
-
- while (de < dir) {
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: exofs_delete_entry:"
- "zero-length entry in directory(0x%lx)\n",
- inode->i_ino);
- err = -EIO;
- goto out;
- }
- pde = de;
- de = exofs_next_entry(de);
- }
- if (pde)
- from = (char *)pde - (char *)page_address(page);
- pos = page_offset(page) + from;
- lock_page(page);
- err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
- &page, NULL);
- if (err)
- EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
- err);
- if (pde)
- pde->rec_len = cpu_to_le16(to - from);
- dir->inode_no = 0;
- if (likely(!err))
- err = exofs_commit_chunk(page, pos, to - from);
- inode->i_ctime = inode->i_mtime = current_time(inode);
- mark_inode_dirty(inode);
- sbi->s_numfiles--;
-out:
- exofs_put_page(page);
- return err;
-}
-
-/* kept aligned on 4 bytes */
-#define THIS_DIR ".\0\0"
-#define PARENT_DIR "..\0"
-
-int exofs_make_empty(struct inode *inode, struct inode *parent)
-{
- struct address_space *mapping = inode->i_mapping;
- struct page *page = grab_cache_page(mapping, 0);
- unsigned chunk_size = exofs_chunk_size(inode);
- struct exofs_dir_entry *de;
- int err;
- void *kaddr;
-
- if (!page)
- return -ENOMEM;
-
- err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
- &page, NULL);
- if (err) {
- unlock_page(page);
- goto fail;
- }
-
- kaddr = kmap_atomic(page);
- de = (struct exofs_dir_entry *)kaddr;
- de->name_len = 1;
- de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
- memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
- de->inode_no = cpu_to_le64(inode->i_ino);
- exofs_set_de_type(de, inode);
-
- de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
- de->name_len = 2;
- de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
- de->inode_no = cpu_to_le64(parent->i_ino);
- memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
- exofs_set_de_type(de, inode);
- kunmap_atomic(kaddr);
- err = exofs_commit_chunk(page, 0, chunk_size);
-fail:
- put_page(page);
- return err;
-}
-
-int exofs_empty_dir(struct inode *inode)
-{
- struct page *page = NULL;
- unsigned long i, npages = dir_pages(inode);
-
- for (i = 0; i < npages; i++) {
- char *kaddr;
- struct exofs_dir_entry *de;
- page = exofs_get_page(inode, i);
-
- if (IS_ERR(page))
- continue;
-
- kaddr = page_address(page);
- de = (struct exofs_dir_entry *)kaddr;
- kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
-
- while ((char *)de <= kaddr) {
- if (de->rec_len == 0) {
- EXOFS_ERR("ERROR: exofs_empty_dir: "
- "zero-length directory entry"
- "kaddr=%p, de=%p\n", kaddr, de);
- goto not_empty;
- }
- if (de->inode_no != 0) {
- /* check for . and .. */
- if (de->name[0] != '.')
- goto not_empty;
- if (de->name_len > 2)
- goto not_empty;
- if (de->name_len < 2) {
- if (le64_to_cpu(de->inode_no) !=
- inode->i_ino)
- goto not_empty;
- } else if (de->name[1] != '.')
- goto not_empty;
- }
- de = exofs_next_entry(de);
- }
- exofs_put_page(page);
- }
- return 1;
-
-not_empty:
- exofs_put_page(page);
- return 0;
-}
-
-const struct file_operations exofs_dir_operations = {
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
- .iterate_shared = exofs_readdir,
-};
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
deleted file mode 100644
index 5dc392404559..000000000000
--- a/fs/exofs/exofs.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __EXOFS_H__
-#define __EXOFS_H__
-
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/backing-dev.h>
-#include <scsi/osd_ore.h>
-
-#include "common.h"
-
-#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
-
-#ifdef CONFIG_EXOFS_DEBUG
-#define EXOFS_DBGMSG(fmt, a...) \
- printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define EXOFS_DBGMSG(fmt, a...) \
- do { if (0) printk(fmt, ##a); } while (0)
-#endif
-
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
-
-struct exofs_dev {
- struct ore_dev ored;
- unsigned did;
- unsigned urilen;
- uint8_t *uri;
- struct kobject ed_kobj;
-};
-/*
- * our extension to the in-memory superblock
- */
-struct exofs_sb_info {
- struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
- int s_timeout; /* timeout for OSD operations */
- uint64_t s_nextid; /* highest object ID used */
- uint32_t s_numfiles; /* number of files on fs */
- spinlock_t s_next_gen_lock; /* spinlock for gen # update */
- u32 s_next_generation; /* next gen # to use */
- atomic_t s_curr_pending; /* number of pending commands */
-
- struct ore_layout layout; /* Default files layout */
- struct ore_comp one_comp; /* id & cred of partition id=0*/
- struct ore_components oc; /* comps for the partition */
- struct kobject s_kobj; /* holds per-sbi kobject */
-};
-
-/*
- * our extension to the in-memory inode
- */
-struct exofs_i_info {
- struct inode vfs_inode; /* normal in-memory inode */
- wait_queue_head_t i_wq; /* wait queue for inode */
- unsigned long i_flags; /* various atomic flags */
- uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
- uint32_t i_dir_start_lookup; /* which page to start lookup */
- uint64_t i_commit_size; /* the object's written length */
- struct ore_comp one_comp; /* same component for all devices */
- struct ore_components oc; /* inode view of the device table */
-};
-
-static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
-{
- return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
-}
-
-/*
- * our inode flags
- */
-#define OBJ_2BCREATED 0 /* object will be created soon*/
-#define OBJ_CREATED 1 /* object has been created on the osd*/
-
-static inline int obj_2bcreated(struct exofs_i_info *oi)
-{
- return test_bit(OBJ_2BCREATED, &oi->i_flags);
-}
-
-static inline void set_obj_2bcreated(struct exofs_i_info *oi)
-{
- set_bit(OBJ_2BCREATED, &oi->i_flags);
-}
-
-static inline int obj_created(struct exofs_i_info *oi)
-{
- return test_bit(OBJ_CREATED, &oi->i_flags);
-}
-
-static inline void set_obj_created(struct exofs_i_info *oi)
-{
- set_bit(OBJ_CREATED, &oi->i_flags);
-}
-
-int __exofs_wait_obj_created(struct exofs_i_info *oi);
-static inline int wait_obj_created(struct exofs_i_info *oi)
-{
- if (likely(obj_created(oi)))
- return 0;
-
- return __exofs_wait_obj_created(oi);
-}
-
-/*
- * get to our inode from the vfs inode
- */
-static inline struct exofs_i_info *exofs_i(struct inode *inode)
-{
- return container_of(inode, struct exofs_i_info, vfs_inode);
-}
-
-/*
- * Maximum count of links to a file
- */
-#define EXOFS_LINK_MAX 32000
-
-/*************************
- * function declarations *
- *************************/
-
-/* inode.c */
-unsigned exofs_max_io_pages(struct ore_layout *layout,
- unsigned expected_pages);
-int exofs_setattr(struct dentry *, struct iattr *);
-int exofs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata);
-extern struct inode *exofs_iget(struct super_block *, unsigned long);
-struct inode *exofs_new_inode(struct inode *, umode_t);
-extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
-extern void exofs_evict_inode(struct inode *);
-
-/* dir.c: */
-int exofs_add_link(struct dentry *, struct inode *);
-ino_t exofs_inode_by_name(struct inode *, struct dentry *);
-int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
-int exofs_make_empty(struct inode *, struct inode *);
-struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
- struct page **);
-int exofs_empty_dir(struct inode *);
-struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
-ino_t exofs_parent_ino(struct dentry *child);
-int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
- struct inode *);
-
-/* super.c */
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
- const struct osd_obj_id *obj);
-int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
-
-/* sys.c */
-int exofs_sysfs_init(void);
-void exofs_sysfs_uninit(void);
-int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
- struct exofs_dt_device_info *dt_dev);
-void exofs_sysfs_sb_del(struct exofs_sb_info *sbi);
-int exofs_sysfs_odev_add(struct exofs_dev *edev,
- struct exofs_sb_info *sbi);
-void exofs_sysfs_dbg_print(void);
-
-/*********************
- * operation vectors *
- *********************/
-/* dir.c: */
-extern const struct file_operations exofs_dir_operations;
-
-/* file.c */
-extern const struct inode_operations exofs_file_inode_operations;
-extern const struct file_operations exofs_file_operations;
-
-/* inode.c */
-extern const struct address_space_operations exofs_aops;
-
-/* namei.c */
-extern const struct inode_operations exofs_dir_inode_operations;
-extern const struct inode_operations exofs_special_inode_operations;
-
-/* exofs_init_comps will initialize an ore_components device array
- * pointing to a single ore_comp struct, and a round-robin view
- * of the device table.
- * The first device of each inode is the [inode->ino % num_devices]
- * and the rest of the devices sequentially following where the
- * first device is after the last device.
- * It is assumed that the global device array at @sbi is twice
- * bigger and that the device table repeats twice.
- * See: exofs_read_lookup_dev_table()
- */
-static inline void exofs_init_comps(struct ore_components *oc,
- struct ore_comp *one_comp,
- struct exofs_sb_info *sbi, osd_id oid)
-{
- unsigned dev_mod = (unsigned)oid, first_dev;
-
- one_comp->obj.partition = sbi->one_comp.obj.partition;
- one_comp->obj.id = oid;
- exofs_make_credential(one_comp->cred, &one_comp->obj);
-
- oc->first_dev = 0;
- oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
- sbi->layout.group_count;
- oc->single_comp = EC_SINGLE_COMP;
- oc->comps = one_comp;
-
- /* Round robin device view of the table */
- first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
- oc->ods = &sbi->oc.ods[first_dev];
-}
-
-#endif
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
deleted file mode 100644
index a94594ea2aa3..000000000000
--- a/fs/exofs/file.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "exofs.h"
-
-static int exofs_release_file(struct inode *inode, struct file *filp)
-{
- return 0;
-}
-
-/* exofs_file_fsync - flush the inode to disk
- *
- * Note, in exofs all metadata is written as part of inode, regardless.
- * The writeout is synchronous
- */
-static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end,
- int datasync)
-{
- struct inode *inode = filp->f_mapping->host;
- int ret;
-
- ret = file_write_and_wait_range(filp, start, end);
- if (ret)
- return ret;
-
- inode_lock(inode);
- ret = sync_inode_metadata(filp->f_mapping->host, 1);
- inode_unlock(inode);
- return ret;
-}
-
-static int exofs_flush(struct file *file, fl_owner_t id)
-{
- int ret = vfs_fsync(file, 0);
- /* TODO: Flush the OSD target */
- return ret;
-}
-
-const struct file_operations exofs_file_operations = {
- .llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
- .open = generic_file_open,
- .release = exofs_release_file,
- .fsync = exofs_file_fsync,
- .flush = exofs_flush,
- .splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
-};
-
-const struct inode_operations exofs_file_inode_operations = {
- .setattr = exofs_setattr,
-};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
deleted file mode 100644
index 5f81fcd383a4..000000000000
--- a/fs/exofs/inode.c
+++ /dev/null
@@ -1,1514 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/slab.h>
-
-#include "exofs.h"
-
-#define EXOFS_DBGMSG2(M...) do {} while (0)
-
-unsigned exofs_max_io_pages(struct ore_layout *layout,
- unsigned expected_pages)
-{
- unsigned pages = min_t(unsigned, expected_pages,
- layout->max_io_length / PAGE_SIZE);
-
- return pages;
-}
-
-struct page_collect {
- struct exofs_sb_info *sbi;
- struct inode *inode;
- unsigned expected_pages;
- struct ore_io_state *ios;
-
- struct page **pages;
- unsigned alloc_pages;
- unsigned nr_pages;
- unsigned long length;
- loff_t pg_first; /* keep 64bit also in 32-arches */
- bool read_4_write; /* This means two things: that the read is sync
- * And the pages should not be unlocked.
- */
- struct page *that_locked_page;
-};
-
-static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
- struct inode *inode)
-{
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
-
- pcol->sbi = sbi;
- pcol->inode = inode;
- pcol->expected_pages = expected_pages;
-
- pcol->ios = NULL;
- pcol->pages = NULL;
- pcol->alloc_pages = 0;
- pcol->nr_pages = 0;
- pcol->length = 0;
- pcol->pg_first = -1;
- pcol->read_4_write = false;
- pcol->that_locked_page = NULL;
-}
-
-static void _pcol_reset(struct page_collect *pcol)
-{
- pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
-
- pcol->pages = NULL;
- pcol->alloc_pages = 0;
- pcol->nr_pages = 0;
- pcol->length = 0;
- pcol->pg_first = -1;
- pcol->ios = NULL;
- pcol->that_locked_page = NULL;
-
- /* this is probably the end of the loop but in writes
- * it might not end here. don't be left with nothing
- */
- if (!pcol->expected_pages)
- pcol->expected_pages =
- exofs_max_io_pages(&pcol->sbi->layout, ~0);
-}
-
-static int pcol_try_alloc(struct page_collect *pcol)
-{
- unsigned pages;
-
- /* TODO: easily support bio chaining */
- pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
-
- for (; pages; pages >>= 1) {
- pcol->pages = kmalloc_array(pages, sizeof(struct page *),
- GFP_KERNEL);
- if (likely(pcol->pages)) {
- pcol->alloc_pages = pages;
- return 0;
- }
- }
-
- EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
- pcol->expected_pages);
- return -ENOMEM;
-}
-
-static void pcol_free(struct page_collect *pcol)
-{
- kfree(pcol->pages);
- pcol->pages = NULL;
-
- if (pcol->ios) {
- ore_put_io_state(pcol->ios);
- pcol->ios = NULL;
- }
-}
-
-static int pcol_add_page(struct page_collect *pcol, struct page *page,
- unsigned len)
-{
- if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
- return -ENOMEM;
-
- pcol->pages[pcol->nr_pages++] = page;
- pcol->length += len;
- return 0;
-}
-
-enum {PAGE_WAS_NOT_IN_IO = 17};
-static int update_read_page(struct page *page, int ret)
-{
- switch (ret) {
- case 0:
- /* Everything is OK */
- SetPageUptodate(page);
- if (PageError(page))
- ClearPageError(page);
- break;
- case -EFAULT:
- /* In this case we were trying to read something that wasn't on
- * disk yet - return a page full of zeroes. This should be OK,
- * because the object should be empty (if there was a write
- * before this read, the read would be waiting with the page
- * locked */
- clear_highpage(page);
-
- SetPageUptodate(page);
- if (PageError(page))
- ClearPageError(page);
- EXOFS_DBGMSG("recovered read error\n");
- /* fall through */
- case PAGE_WAS_NOT_IN_IO:
- ret = 0; /* recovered error */
- break;
- default:
- SetPageError(page);
- }
- return ret;
-}
-
-static void update_write_page(struct page *page, int ret)
-{
- if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
- return; /* don't pass start don't collect $200 */
-
- if (ret) {
- mapping_set_error(page->mapping, ret);
- SetPageError(page);
- }
- end_page_writeback(page);
-}
-
-/* Called at the end of reads, to optionally unlock pages and update their
- * status.
- */
-static int __readpages_done(struct page_collect *pcol)
-{
- int i;
- u64 good_bytes;
- u64 length = 0;
- int ret = ore_check_io(pcol->ios, NULL);
-
- if (likely(!ret)) {
- good_bytes = pcol->length;
- ret = PAGE_WAS_NOT_IN_IO;
- } else {
- good_bytes = 0;
- }
-
- EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
- " length=0x%lx nr_pages=%u\n",
- pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
- pcol->nr_pages);
-
- for (i = 0; i < pcol->nr_pages; i++) {
- struct page *page = pcol->pages[i];
- struct inode *inode = page->mapping->host;
- int page_stat;
-
- if (inode != pcol->inode)
- continue; /* osd might add more pages at end */
-
- if (likely(length < good_bytes))
- page_stat = 0;
- else
- page_stat = ret;
-
- EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n",
- inode->i_ino, page->index,
- page_stat ? "bad_bytes" : "good_bytes");
-
- ret = update_read_page(page, page_stat);
- if (!pcol->read_4_write)
- unlock_page(page);
- length += PAGE_SIZE;
- }
-
- pcol_free(pcol);
- EXOFS_DBGMSG2("readpages_done END\n");
- return ret;
-}
-
-/* callback of async reads */
-static void readpages_done(struct ore_io_state *ios, void *p)
-{
- struct page_collect *pcol = p;
-
- __readpages_done(pcol);
- atomic_dec(&pcol->sbi->s_curr_pending);
- kfree(pcol);
-}
-
-static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
-{
- int i;
-
- for (i = 0; i < pcol->nr_pages; i++) {
- struct page *page = pcol->pages[i];
-
- if (rw == READ)
- update_read_page(page, ret);
- else
- update_write_page(page, ret);
-
- unlock_page(page);
- }
-}
-
-static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
- struct page_collect *pcol_src, struct page_collect *pcol)
-{
- /* length was wrong or offset was not page aligned */
- BUG_ON(pcol_src->nr_pages < ios->nr_pages);
-
- if (pcol_src->nr_pages > ios->nr_pages) {
- struct page **src_page;
- unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
- unsigned long len_less = pcol_src->length - ios->length;
- unsigned i;
- int ret;
-
- /* This IO was trimmed */
- pcol_src->nr_pages = ios->nr_pages;
- pcol_src->length = ios->length;
-
- /* Left over pages are passed to the next io */
- pcol->expected_pages += pages_less;
- pcol->nr_pages = pages_less;
- pcol->length = len_less;
- src_page = pcol_src->pages + pcol_src->nr_pages;
- pcol->pg_first = (*src_page)->index;
-
- ret = pcol_try_alloc(pcol);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < pages_less; ++i)
- pcol->pages[i] = *src_page++;
-
- EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
- "pages_less=0x%x expected_pages=0x%x "
- "next_offset=0x%llx next_len=0x%lx\n",
- pcol_src->nr_pages, pages_less, pcol->expected_pages,
- pcol->pg_first * PAGE_SIZE, pcol->length);
- }
- return 0;
-}
-
-static int read_exec(struct page_collect *pcol)
-{
- struct exofs_i_info *oi = exofs_i(pcol->inode);
- struct ore_io_state *ios;
- struct page_collect *pcol_copy = NULL;
- int ret;
-
- if (!pcol->pages)
- return 0;
-
- if (!pcol->ios) {
- int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
- pcol->pg_first << PAGE_SHIFT,
- pcol->length, &pcol->ios);
-
- if (ret)
- return ret;
- }
-
- ios = pcol->ios;
- ios->pages = pcol->pages;
-
- if (pcol->read_4_write) {
- ore_read(pcol->ios);
- return __readpages_done(pcol);
- }
-
- pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
- if (!pcol_copy) {
- ret = -ENOMEM;
- goto err;
- }
-
- *pcol_copy = *pcol;
- ios->done = readpages_done;
- ios->private = pcol_copy;
-
- /* pages ownership was passed to pcol_copy */
- _pcol_reset(pcol);
-
- ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
- if (unlikely(ret))
- goto err;
-
- EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
- pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
-
- ret = ore_read(ios);
- if (unlikely(ret))
- goto err;
-
- atomic_inc(&pcol->sbi->s_curr_pending);
-
- return 0;
-
-err:
- if (!pcol_copy) /* Failed before ownership transfer */
- pcol_copy = pcol;
- _unlock_pcol_pages(pcol_copy, ret, READ);
- pcol_free(pcol_copy);
- kfree(pcol_copy);
-
- return ret;
-}
-
-/* readpage_strip is called either directly from readpage() or by the VFS from
- * within read_cache_pages(), to add one more page to be read. It will try to
- * collect as many contiguous pages as posible. If a discontinuity is
- * encountered, or it runs out of resources, it will submit the previous segment
- * and will start a new collection. Eventually caller must submit the last
- * segment if present.
- */
-static int readpage_strip(void *data, struct page *page)
-{
- struct page_collect *pcol = data;
- struct inode *inode = pcol->inode;
- struct exofs_i_info *oi = exofs_i(inode);
- loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
- size_t len;
- int ret;
-
- BUG_ON(!PageLocked(page));
-
- /* FIXME: Just for debugging, will be removed */
- if (PageUptodate(page))
- EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
- page->index);
-
- pcol->that_locked_page = page;
-
- if (page->index < end_index)
- len = PAGE_SIZE;
- else if (page->index == end_index)
- len = i_size & ~PAGE_MASK;
- else
- len = 0;
-
- if (!len || !obj_created(oi)) {
- /* this will be out of bounds, or doesn't exist yet.
- * Current page is cleared and the request is split
- */
- clear_highpage(page);
-
- SetPageUptodate(page);
- if (PageError(page))
- ClearPageError(page);
-
- if (!pcol->read_4_write)
- unlock_page(page);
- EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx "
- "read_4_write=%d index=0x%lx end_index=0x%lx "
- "splitting\n", inode->i_ino, len,
- pcol->read_4_write, page->index, end_index);
-
- return read_exec(pcol);
- }
-
-try_again:
-
- if (unlikely(pcol->pg_first == -1)) {
- pcol->pg_first = page->index;
- } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
- page->index)) {
- /* Discontinuity detected, split the request */
- ret = read_exec(pcol);
- if (unlikely(ret))
- goto fail;
- goto try_again;
- }
-
- if (!pcol->pages) {
- ret = pcol_try_alloc(pcol);
- if (unlikely(ret))
- goto fail;
- }
-
- if (len != PAGE_SIZE)
- zero_user(page, len, PAGE_SIZE - len);
-
- EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
- inode->i_ino, page->index, len);
-
- ret = pcol_add_page(pcol, page, len);
- if (ret) {
- EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p "
- "this_len=0x%zx nr_pages=%u length=0x%lx\n",
- page, len, pcol->nr_pages, pcol->length);
-
- /* split the request, and start again with current page */
- ret = read_exec(pcol);
- if (unlikely(ret))
- goto fail;
-
- goto try_again;
- }
-
- return 0;
-
-fail:
- /* SetPageError(page); ??? */
- unlock_page(page);
- return ret;
-}
-
-static int exofs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
-{
- struct page_collect pcol;
- int ret;
-
- _pcol_init(&pcol, nr_pages, mapping->host);
-
- ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
- if (ret) {
- EXOFS_ERR("read_cache_pages => %d\n", ret);
- return ret;
- }
-
- ret = read_exec(&pcol);
- if (unlikely(ret))
- return ret;
-
- return read_exec(&pcol);
-}
-
-static int _readpage(struct page *page, bool read_4_write)
-{
- struct page_collect pcol;
- int ret;
-
- _pcol_init(&pcol, 1, page->mapping->host);
-
- pcol.read_4_write = read_4_write;
- ret = readpage_strip(&pcol, page);
- if (ret) {
- EXOFS_ERR("_readpage => %d\n", ret);
- return ret;
- }
-
- return read_exec(&pcol);
-}
-
-/*
- * We don't need the file
- */
-static int exofs_readpage(struct file *file, struct page *page)
-{
- return _readpage(page, false);
-}
-
-/* Callback for osd_write. All writes are asynchronous */
-static void writepages_done(struct ore_io_state *ios, void *p)
-{
- struct page_collect *pcol = p;
- int i;
- u64 good_bytes;
- u64 length = 0;
- int ret = ore_check_io(ios, NULL);
-
- atomic_dec(&pcol->sbi->s_curr_pending);
-
- if (likely(!ret)) {
- good_bytes = pcol->length;
- ret = PAGE_WAS_NOT_IN_IO;
- } else {
- good_bytes = 0;
- }
-
- EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
- " length=0x%lx nr_pages=%u\n",
- pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
- pcol->nr_pages);
-
- for (i = 0; i < pcol->nr_pages; i++) {
- struct page *page = pcol->pages[i];
- struct inode *inode = page->mapping->host;
- int page_stat;
-
- if (inode != pcol->inode)
- continue; /* osd might add more pages to a bio */
-
- if (likely(length < good_bytes))
- page_stat = 0;
- else
- page_stat = ret;
-
- update_write_page(page, page_stat);
- unlock_page(page);
- EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n",
- inode->i_ino, page->index, page_stat);
-
- length += PAGE_SIZE;
- }
-
- pcol_free(pcol);
- kfree(pcol);
- EXOFS_DBGMSG2("writepages_done END\n");
-}
-
-static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
-{
- struct page_collect *pcol = priv;
- pgoff_t index = offset / PAGE_SIZE;
-
- if (!pcol->that_locked_page ||
- (pcol->that_locked_page->index != index)) {
- struct page *page;
- loff_t i_size = i_size_read(pcol->inode);
-
- if (offset >= i_size) {
- *uptodate = true;
- EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index);
- return ZERO_PAGE(0);
- }
-
- page = find_get_page(pcol->inode->i_mapping, index);
- if (!page) {
- page = find_or_create_page(pcol->inode->i_mapping,
- index, GFP_NOFS);
- if (unlikely(!page)) {
- EXOFS_DBGMSG("grab_cache_page Failed "
- "index=0x%llx\n", _LLU(index));
- return NULL;
- }
- unlock_page(page);
- }
- *uptodate = PageUptodate(page);
- EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
- return page;
- } else {
- EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n",
- pcol->that_locked_page->index);
- *uptodate = true;
- return pcol->that_locked_page;
- }
-}
-
-static void __r4w_put_page(void *priv, struct page *page)
-{
- struct page_collect *pcol = priv;
-
- if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
- EXOFS_DBGMSG2("index=0x%lx\n", page->index);
- put_page(page);
- return;
- }
- EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
- ZERO_PAGE(0) == page ? -1 : page->index);
-}
-
-static const struct _ore_r4w_op _r4w_op = {
- .get_page = &__r4w_get_page,
- .put_page = &__r4w_put_page,
-};
-
-static int write_exec(struct page_collect *pcol)
-{
- struct exofs_i_info *oi = exofs_i(pcol->inode);
- struct ore_io_state *ios;
- struct page_collect *pcol_copy = NULL;
- int ret;
-
- if (!pcol->pages)
- return 0;
-
- BUG_ON(pcol->ios);
- ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
- pcol->pg_first << PAGE_SHIFT,
- pcol->length, &pcol->ios);
- if (unlikely(ret))
- goto err;
-
- pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
- if (!pcol_copy) {
- EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
- ret = -ENOMEM;
- goto err;
- }
-
- *pcol_copy = *pcol;
-
- ios = pcol->ios;
- ios->pages = pcol_copy->pages;
- ios->done = writepages_done;
- ios->r4w = &_r4w_op;
- ios->private = pcol_copy;
-
- /* pages ownership was passed to pcol_copy */
- _pcol_reset(pcol);
-
- ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
- if (unlikely(ret))
- goto err;
-
- EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
- pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
-
- ret = ore_write(ios);
- if (unlikely(ret)) {
- EXOFS_ERR("write_exec: ore_write() Failed\n");
- goto err;
- }
-
- atomic_inc(&pcol->sbi->s_curr_pending);
- return 0;
-
-err:
- if (!pcol_copy) /* Failed before ownership transfer */
- pcol_copy = pcol;
- _unlock_pcol_pages(pcol_copy, ret, WRITE);
- pcol_free(pcol_copy);
- kfree(pcol_copy);
-
- return ret;
-}
-
-/* writepage_strip is called either directly from writepage() or by the VFS from
- * within write_cache_pages(), to add one more page to be written to storage.
- * It will try to collect as many contiguous pages as possible. If a
- * discontinuity is encountered or it runs out of resources it will submit the
- * previous segment and will start a new collection.
- * Eventually caller must submit the last segment if present.
- */
-static int writepage_strip(struct page *page,
- struct writeback_control *wbc_unused, void *data)
-{
- struct page_collect *pcol = data;
- struct inode *inode = pcol->inode;
- struct exofs_i_info *oi = exofs_i(inode);
- loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
- size_t len;
- int ret;
-
- BUG_ON(!PageLocked(page));
-
- ret = wait_obj_created(oi);
- if (unlikely(ret))
- goto fail;
-
- if (page->index < end_index)
- /* in this case, the page is within the limits of the file */
- len = PAGE_SIZE;
- else {
- len = i_size & ~PAGE_MASK;
-
- if (page->index > end_index || !len) {
- /* in this case, the page is outside the limits
- * (truncate in progress)
- */
- ret = write_exec(pcol);
- if (unlikely(ret))
- goto fail;
- if (PageError(page))
- ClearPageError(page);
- unlock_page(page);
- EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) "
- "outside the limits\n",
- inode->i_ino, page->index);
- return 0;
- }
- }
-
-try_again:
-
- if (unlikely(pcol->pg_first == -1)) {
- pcol->pg_first = page->index;
- } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
- page->index)) {
- /* Discontinuity detected, split the request */
- ret = write_exec(pcol);
- if (unlikely(ret))
- goto fail;
-
- EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n",
- inode->i_ino, page->index);
- goto try_again;
- }
-
- if (!pcol->pages) {
- ret = pcol_try_alloc(pcol);
- if (unlikely(ret))
- goto fail;
- }
-
- EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
- inode->i_ino, page->index, len);
-
- ret = pcol_add_page(pcol, page, len);
- if (unlikely(ret)) {
- EXOFS_DBGMSG2("Failed pcol_add_page "
- "nr_pages=%u total_length=0x%lx\n",
- pcol->nr_pages, pcol->length);
-
- /* split the request, next loop will start again */
- ret = write_exec(pcol);
- if (unlikely(ret)) {
- EXOFS_DBGMSG("write_exec failed => %d", ret);
- goto fail;
- }
-
- goto try_again;
- }
-
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
-
- return 0;
-
-fail:
- EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
- inode->i_ino, page->index, ret);
- mapping_set_error(page->mapping, -EIO);
- unlock_page(page);
- return ret;
-}
-
-static int exofs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct page_collect pcol;
- long start, end, expected_pages;
- int ret;
-
- start = wbc->range_start >> PAGE_SHIFT;
- end = (wbc->range_end == LLONG_MAX) ?
- start + mapping->nrpages :
- wbc->range_end >> PAGE_SHIFT;
-
- if (start || end)
- expected_pages = end - start + 1;
- else
- expected_pages = mapping->nrpages;
-
- if (expected_pages < 32L)
- expected_pages = 32L;
-
- EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
- "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
- mapping->host->i_ino, wbc->range_start, wbc->range_end,
- mapping->nrpages, start, end, expected_pages);
-
- _pcol_init(&pcol, expected_pages, mapping->host);
-
- ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
- if (unlikely(ret)) {
- EXOFS_ERR("write_cache_pages => %d\n", ret);
- return ret;
- }
-
- ret = write_exec(&pcol);
- if (unlikely(ret))
- return ret;
-
- if (wbc->sync_mode == WB_SYNC_ALL) {
- return write_exec(&pcol); /* pump the last reminder */
- } else if (pcol.nr_pages) {
- /* not SYNC let the reminder join the next writeout */
- unsigned i;
-
- for (i = 0; i < pcol.nr_pages; i++) {
- struct page *page = pcol.pages[i];
-
- end_page_writeback(page);
- set_page_dirty(page);
- unlock_page(page);
- }
- }
- return 0;
-}
-
-/*
-static int exofs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct page_collect pcol;
- int ret;
-
- _pcol_init(&pcol, 1, page->mapping->host);
-
- ret = writepage_strip(page, NULL, &pcol);
- if (ret) {
- EXOFS_ERR("exofs_writepage => %d\n", ret);
- return ret;
- }
-
- return write_exec(&pcol);
-}
-*/
-/* i_mutex held using inode->i_size directly */
-static void _write_failed(struct inode *inode, loff_t to)
-{
- if (to > inode->i_size)
- truncate_pagecache(inode, inode->i_size);
-}
-
-int exofs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- int ret = 0;
- struct page *page;
-
- page = *pagep;
- if (page == NULL) {
- page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT,
- flags);
- if (!page) {
- EXOFS_DBGMSG("grab_cache_page_write_begin failed\n");
- return -ENOMEM;
- }
- *pagep = page;
- }
-
- /* read modify write */
- if (!PageUptodate(page) && (len != PAGE_SIZE)) {
- loff_t i_size = i_size_read(mapping->host);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
-
- if (page->index > end_index) {
- clear_highpage(page);
- SetPageUptodate(page);
- } else {
- ret = _readpage(page, true);
- if (ret) {
- unlock_page(page);
- EXOFS_DBGMSG("__readpage failed\n");
- }
- }
- }
- return ret;
-}
-
-static int exofs_write_begin_export(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- *pagep = NULL;
-
- return exofs_write_begin(file, mapping, pos, len, flags, pagep,
- fsdata);
-}
-
-static int exofs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = mapping->host;
- loff_t last_pos = pos + copied;
-
- if (!PageUptodate(page)) {
- if (copied < len) {
- _write_failed(inode, pos + len);
- copied = 0;
- goto out;
- }
- SetPageUptodate(page);
- }
- if (last_pos > inode->i_size) {
- i_size_write(inode, last_pos);
- mark_inode_dirty(inode);
- }
- set_page_dirty(page);
-out:
- unlock_page(page);
- put_page(page);
- return copied;
-}
-
-static int exofs_releasepage(struct page *page, gfp_t gfp)
-{
- EXOFS_DBGMSG("page 0x%lx\n", page->index);
- WARN_ON(1);
- return 0;
-}
-
-static void exofs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
- page->index, offset, length);
- WARN_ON(1);
-}
-
-
- /* TODO: Should be easy enough to do proprly */
-static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- return 0;
-}
-
-const struct address_space_operations exofs_aops = {
- .readpage = exofs_readpage,
- .readpages = exofs_readpages,
- .writepage = NULL,
- .writepages = exofs_writepages,
- .write_begin = exofs_write_begin_export,
- .write_end = exofs_write_end,
- .releasepage = exofs_releasepage,
- .set_page_dirty = __set_page_dirty_nobuffers,
- .invalidatepage = exofs_invalidatepage,
-
- /* Not implemented Yet */
- .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
- .direct_IO = exofs_direct_IO,
-
- /* With these NULL has special meaning or default is not exported */
- .migratepage = NULL,
- .launder_page = NULL,
- .is_partially_uptodate = NULL,
- .error_remove_page = NULL,
-};
-
-/******************************************************************************
- * INODE OPERATIONS
- *****************************************************************************/
-
-/*
- * Test whether an inode is a fast symlink.
- */
-static inline int exofs_inode_is_fast_symlink(struct inode *inode)
-{
- struct exofs_i_info *oi = exofs_i(inode);
-
- return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
-}
-
-static int _do_truncate(struct inode *inode, loff_t newsize)
-{
- struct exofs_i_info *oi = exofs_i(inode);
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
- int ret;
-
- inode->i_mtime = inode->i_ctime = current_time(inode);
-
- ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
- if (likely(!ret))
- truncate_setsize(inode, newsize);
-
- EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n",
- inode->i_ino, newsize, ret);
- return ret;
-}
-
-/*
- * Set inode attributes - update size attribute on OSD if needed,
- * otherwise just call generic functions.
- */
-int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
-{
- struct inode *inode = d_inode(dentry);
- int error;
-
- /* if we are about to modify an object, and it hasn't been
- * created yet, wait
- */
- error = wait_obj_created(exofs_i(inode));
- if (unlikely(error))
- return error;
-
- error = setattr_prepare(dentry, iattr);
- if (unlikely(error))
- return error;
-
- if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(inode)) {
- error = _do_truncate(inode, iattr->ia_size);
- if (unlikely(error))
- return error;
- }
-
- setattr_copy(inode, iattr);
- mark_inode_dirty(inode);
- return 0;
-}
-
-static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
- EXOFS_APAGE_FS_DATA,
- EXOFS_ATTR_INODE_FILE_LAYOUT,
- 0);
-static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
- EXOFS_APAGE_FS_DATA,
- EXOFS_ATTR_INODE_DIR_LAYOUT,
- 0);
-
-/*
- * Read the Linux inode info from the OSD, and return it as is. In exofs the
- * inode info is in an application specific page/attribute of the osd-object.
- */
-static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
- struct exofs_fcb *inode)
-{
- struct exofs_sb_info *sbi = sb->s_fs_info;
- struct osd_attr attrs[] = {
- [0] = g_attr_inode_data,
- [1] = g_attr_inode_file_layout,
- [2] = g_attr_inode_dir_layout,
- };
- struct ore_io_state *ios;
- struct exofs_on_disk_inode_layout *layout;
- int ret;
-
- ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
- return ret;
- }
-
- attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
- attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
-
- ios->in_attr = attrs;
- ios->in_attr_len = ARRAY_SIZE(attrs);
-
- ret = ore_read(ios);
- if (unlikely(ret)) {
- EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
- _LLU(oi->one_comp.obj.id), ret);
- memset(inode, 0, sizeof(*inode));
- inode->i_mode = 0040000 | (0777 & ~022);
- /* If object is lost on target we might as well enable it's
- * delete.
- */
- ret = 0;
- goto out;
- }
-
- ret = extract_attr_from_ios(ios, &attrs[0]);
- if (ret) {
- EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__);
- goto out;
- }
- WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
- memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
-
- ret = extract_attr_from_ios(ios, &attrs[1]);
- if (ret) {
- EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__);
- goto out;
- }
- if (attrs[1].len) {
- layout = attrs[1].val_ptr;
- if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
- EXOFS_ERR("%s: unsupported files layout %d\n",
- __func__, layout->gen_func);
- ret = -ENOTSUPP;
- goto out;
- }
- }
-
- ret = extract_attr_from_ios(ios, &attrs[2]);
- if (ret) {
- EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__);
- goto out;
- }
- if (attrs[2].len) {
- layout = attrs[2].val_ptr;
- if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
- EXOFS_ERR("%s: unsupported meta-data layout %d\n",
- __func__, layout->gen_func);
- ret = -ENOTSUPP;
- goto out;
- }
- }
-
-out:
- ore_put_io_state(ios);
- return ret;
-}
-
-static void __oi_init(struct exofs_i_info *oi)
-{
- init_waitqueue_head(&oi->i_wq);
- oi->i_flags = 0;
-}
-/*
- * Fill in an inode read from the OSD and set it up for use
- */
-struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
-{
- struct exofs_i_info *oi;
- struct exofs_fcb fcb;
- struct inode *inode;
- int ret;
-
- inode = iget_locked(sb, ino);
- if (!inode)
- return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
- return inode;
- oi = exofs_i(inode);
- __oi_init(oi);
- exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
- exofs_oi_objno(oi));
-
- /* read the inode from the osd */
- ret = exofs_get_inode(sb, oi, &fcb);
- if (ret)
- goto bad_inode;
-
- set_obj_created(oi);
-
- /* copy stuff from on-disk struct to in-memory struct */
- inode->i_mode = le16_to_cpu(fcb.i_mode);
- i_uid_write(inode, le32_to_cpu(fcb.i_uid));
- i_gid_write(inode, le32_to_cpu(fcb.i_gid));
- set_nlink(inode, le16_to_cpu(fcb.i_links_count));
- inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
- inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
- inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
- inode->i_ctime.tv_nsec =
- inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
- oi->i_commit_size = le64_to_cpu(fcb.i_size);
- i_size_write(inode, oi->i_commit_size);
- inode->i_blkbits = EXOFS_BLKSHIFT;
- inode->i_generation = le32_to_cpu(fcb.i_generation);
-
- oi->i_dir_start_lookup = 0;
-
- if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
- ret = -ESTALE;
- goto bad_inode;
- }
-
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
- if (fcb.i_data[0])
- inode->i_rdev =
- old_decode_dev(le32_to_cpu(fcb.i_data[0]));
- else
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(fcb.i_data[1]));
- } else {
- memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
- }
-
- if (S_ISREG(inode->i_mode)) {
- inode->i_op = &exofs_file_inode_operations;
- inode->i_fop = &exofs_file_operations;
- inode->i_mapping->a_ops = &exofs_aops;
- } else if (S_ISDIR(inode->i_mode)) {
- inode->i_op = &exofs_dir_inode_operations;
- inode->i_fop = &exofs_dir_operations;
- inode->i_mapping->a_ops = &exofs_aops;
- } else if (S_ISLNK(inode->i_mode)) {
- if (exofs_inode_is_fast_symlink(inode)) {
- inode->i_op = &simple_symlink_inode_operations;
- inode->i_link = (char *)oi->i_data;
- } else {
- inode->i_op = &page_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &exofs_aops;
- }
- } else {
- inode->i_op = &exofs_special_inode_operations;
- if (fcb.i_data[0])
- init_special_inode(inode, inode->i_mode,
- old_decode_dev(le32_to_cpu(fcb.i_data[0])));
- else
- init_special_inode(inode, inode->i_mode,
- new_decode_dev(le32_to_cpu(fcb.i_data[1])));
- }
-
- unlock_new_inode(inode);
- return inode;
-
-bad_inode:
- iget_failed(inode);
- return ERR_PTR(ret);
-}
-
-int __exofs_wait_obj_created(struct exofs_i_info *oi)
-{
- if (!obj_created(oi)) {
- EXOFS_DBGMSG("!obj_created\n");
- BUG_ON(!obj_2bcreated(oi));
- wait_event(oi->i_wq, obj_created(oi));
- EXOFS_DBGMSG("wait_event done\n");
- }
- return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
-}
-
-/*
- * Callback function from exofs_new_inode(). The important thing is that we
- * set the obj_created flag so that other methods know that the object exists on
- * the OSD.
- */
-static void create_done(struct ore_io_state *ios, void *p)
-{
- struct inode *inode = p;
- struct exofs_i_info *oi = exofs_i(inode);
- struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
- int ret;
-
- ret = ore_check_io(ios, NULL);
- ore_put_io_state(ios);
-
- atomic_dec(&sbi->s_curr_pending);
-
- if (unlikely(ret)) {
- EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
- _LLU(exofs_oi_objno(oi)),
- _LLU(oi->one_comp.obj.partition));
- /*TODO: When FS is corrupted creation can fail, object already
- * exist. Get rid of this asynchronous creation, if exist
- * increment the obj counter and try the next object. Until we
- * succeed. All these dangling objects will be made into lost
- * files by chkfs.exofs
- */
- }
-
- set_obj_created(oi);
-
- wake_up(&oi->i_wq);
-}
-
-/*
- * Set up a new inode and create an object for it on the OSD
- */
-struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
-{
- struct super_block *sb = dir->i_sb;
- struct exofs_sb_info *sbi = sb->s_fs_info;
- struct inode *inode;
- struct exofs_i_info *oi;
- struct ore_io_state *ios;
- int ret;
-
- inode = new_inode(sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- oi = exofs_i(inode);
- __oi_init(oi);
-
- set_obj_2bcreated(oi);
-
- inode_init_owner(inode, dir, mode);
- inode->i_ino = sbi->s_nextid++;
- inode->i_blkbits = EXOFS_BLKSHIFT;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- oi->i_commit_size = inode->i_size = 0;
- spin_lock(&sbi->s_next_gen_lock);
- inode->i_generation = sbi->s_next_generation++;
- spin_unlock(&sbi->s_next_gen_lock);
- insert_inode_hash(inode);
-
- exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
- exofs_oi_objno(oi));
- exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
-
- mark_inode_dirty(inode);
-
- ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
- return ERR_PTR(ret);
- }
-
- ios->done = create_done;
- ios->private = inode;
-
- ret = ore_create(ios);
- if (ret) {
- ore_put_io_state(ios);
- return ERR_PTR(ret);
- }
- atomic_inc(&sbi->s_curr_pending);
-
- return inode;
-}
-
-/*
- * struct to pass two arguments to update_inode's callback
- */
-struct updatei_args {
- struct exofs_sb_info *sbi;
- struct exofs_fcb fcb;
-};
-
-/*
- * Callback function from exofs_update_inode().
- */
-static void updatei_done(struct ore_io_state *ios, void *p)
-{
- struct updatei_args *args = p;
-
- ore_put_io_state(ios);
-
- atomic_dec(&args->sbi->s_curr_pending);
-
- kfree(args);
-}
-
-/*
- * Write the inode to the OSD. Just fill up the struct, and set the attribute
- * synchronously or asynchronously depending on the do_sync flag.
- */
-static int exofs_update_inode(struct inode *inode, int do_sync)
-{
- struct exofs_i_info *oi = exofs_i(inode);
- struct super_block *sb = inode->i_sb;
- struct exofs_sb_info *sbi = sb->s_fs_info;
- struct ore_io_state *ios;
- struct osd_attr attr;
- struct exofs_fcb *fcb;
- struct updatei_args *args;
- int ret;
-
- args = kzalloc(sizeof(*args), GFP_KERNEL);
- if (!args) {
- EXOFS_DBGMSG("Failed kzalloc of args\n");
- return -ENOMEM;
- }
-
- fcb = &args->fcb;
-
- fcb->i_mode = cpu_to_le16(inode->i_mode);
- fcb->i_uid = cpu_to_le32(i_uid_read(inode));
- fcb->i_gid = cpu_to_le32(i_gid_read(inode));
- fcb->i_links_count = cpu_to_le16(inode->i_nlink);
- fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
- fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
- fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
- oi->i_commit_size = i_size_read(inode);
- fcb->i_size = cpu_to_le64(oi->i_commit_size);
- fcb->i_generation = cpu_to_le32(inode->i_generation);
-
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
- if (old_valid_dev(inode->i_rdev)) {
- fcb->i_data[0] =
- cpu_to_le32(old_encode_dev(inode->i_rdev));
- fcb->i_data[1] = 0;
- } else {
- fcb->i_data[0] = 0;
- fcb->i_data[1] =
- cpu_to_le32(new_encode_dev(inode->i_rdev));
- fcb->i_data[2] = 0;
- }
- } else
- memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
-
- ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
- goto free_args;
- }
-
- attr = g_attr_inode_data;
- attr.val_ptr = fcb;
- ios->out_attr_len = 1;
- ios->out_attr = &attr;
-
- wait_obj_created(oi);
-
- if (!do_sync) {
- args->sbi = sbi;
- ios->done = updatei_done;
- ios->private = args;
- }
-
- ret = ore_write(ios);
- if (!do_sync && !ret) {
- atomic_inc(&sbi->s_curr_pending);
- goto out; /* deallocation in updatei_done */
- }
-
- ore_put_io_state(ios);
-free_args:
- kfree(args);
-out:
- EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n",
- inode->i_ino, do_sync, ret);
- return ret;
-}
-
-int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */
- return exofs_update_inode(inode, 1);
-}
-
-/*
- * Callback function from exofs_delete_inode() - don't have much cleaning up to
- * do.
- */
-static void delete_done(struct ore_io_state *ios, void *p)
-{
- struct exofs_sb_info *sbi = p;
-
- ore_put_io_state(ios);
-
- atomic_dec(&sbi->s_curr_pending);
-}
-
-/*
- * Called when the refcount of an inode reaches zero. We remove the object
- * from the OSD here. We make sure the object was created before we try and
- * delete it.
- */
-void exofs_evict_inode(struct inode *inode)
-{
- struct exofs_i_info *oi = exofs_i(inode);
- struct super_block *sb = inode->i_sb;
- struct exofs_sb_info *sbi = sb->s_fs_info;
- struct ore_io_state *ios;
- int ret;
-
- truncate_inode_pages_final(&inode->i_data);
-
- /* TODO: should do better here */
- if (inode->i_nlink || is_bad_inode(inode))
- goto no_delete;
-
- inode->i_size = 0;
- clear_inode(inode);
-
- /* if we are deleting an obj that hasn't been created yet, wait.
- * This also makes sure that create_done cannot be called with an
- * already evicted inode.
- */
- wait_obj_created(oi);
- /* ignore the error, attempt a remove anyway */
-
- /* Now Remove the OSD objects */
- ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
- return;
- }
-
- ios->done = delete_done;
- ios->private = sbi;
-
- ret = ore_remove(ios);
- if (ret) {
- EXOFS_ERR("%s: ore_remove failed\n", __func__);
- ore_put_io_state(ios);
- return;
- }
- atomic_inc(&sbi->s_curr_pending);
-
- return;
-
-no_delete:
- clear_inode(inode);
-}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
deleted file mode 100644
index 7295cd722770..000000000000
--- a/fs/exofs/namei.c
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "exofs.h"
-
-static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
-{
- int err = exofs_add_link(dentry, inode);
- if (!err) {
- d_instantiate(dentry, inode);
- return 0;
- }
- inode_dec_link_count(inode);
- iput(inode);
- return err;
-}
-
-static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
-{
- struct inode *inode;
- ino_t ino;
-
- if (dentry->d_name.len > EXOFS_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
-
- ino = exofs_inode_by_name(dir, dentry);
- inode = ino ? exofs_iget(dir->i_sb, ino) : NULL;
- return d_splice_alias(inode, dentry);
-}
-
-static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
-{
- struct inode *inode = exofs_new_inode(dir, mode);
- int err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- inode->i_op = &exofs_file_inode_operations;
- inode->i_fop = &exofs_file_operations;
- inode->i_mapping->a_ops = &exofs_aops;
- mark_inode_dirty(inode);
- err = exofs_add_nondir(dentry, inode);
- }
- return err;
-}
-
-static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
- dev_t rdev)
-{
- struct inode *inode;
- int err;
-
- inode = exofs_new_inode(dir, mode);
- err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- init_special_inode(inode, inode->i_mode, rdev);
- mark_inode_dirty(inode);
- err = exofs_add_nondir(dentry, inode);
- }
- return err;
-}
-
-static int exofs_symlink(struct inode *dir, struct dentry *dentry,
- const char *symname)
-{
- struct super_block *sb = dir->i_sb;
- int err = -ENAMETOOLONG;
- unsigned l = strlen(symname)+1;
- struct inode *inode;
- struct exofs_i_info *oi;
-
- if (l > sb->s_blocksize)
- goto out;
-
- inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out;
-
- oi = exofs_i(inode);
- if (l > sizeof(oi->i_data)) {
- /* slow symlink */
- inode->i_op = &page_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &exofs_aops;
- memset(oi->i_data, 0, sizeof(oi->i_data));
-
- err = page_symlink(inode, symname, l);
- if (err)
- goto out_fail;
- } else {
- /* fast symlink */
- inode->i_op = &simple_symlink_inode_operations;
- inode->i_link = (char *)oi->i_data;
- memcpy(oi->i_data, symname, l);
- inode->i_size = l-1;
- }
- mark_inode_dirty(inode);
-
- err = exofs_add_nondir(dentry, inode);
-out:
- return err;
-
-out_fail:
- inode_dec_link_count(inode);
- iput(inode);
- goto out;
-}
-
-static int exofs_link(struct dentry *old_dentry, struct inode *dir,
- struct dentry *dentry)
-{
- struct inode *inode = d_inode(old_dentry);
-
- inode->i_ctime = current_time(inode);
- inode_inc_link_count(inode);
- ihold(inode);
-
- return exofs_add_nondir(dentry, inode);
-}
-
-static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
- struct inode *inode;
- int err;
-
- inode_inc_link_count(dir);
-
- inode = exofs_new_inode(dir, S_IFDIR | mode);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_dir;
-
- inode->i_op = &exofs_dir_inode_operations;
- inode->i_fop = &exofs_dir_operations;
- inode->i_mapping->a_ops = &exofs_aops;
-
- inode_inc_link_count(inode);
-
- err = exofs_make_empty(inode, dir);
- if (err)
- goto out_fail;
-
- err = exofs_add_link(dentry, inode);
- if (err)
- goto out_fail;
-
- d_instantiate(dentry, inode);
-out:
- return err;
-
-out_fail:
- inode_dec_link_count(inode);
- inode_dec_link_count(inode);
- iput(inode);
-out_dir:
- inode_dec_link_count(dir);
- goto out;
-}
-
-static int exofs_unlink(struct inode *dir, struct dentry *dentry)
-{
- struct inode *inode = d_inode(dentry);
- struct exofs_dir_entry *de;
- struct page *page;
- int err = -ENOENT;
-
- de = exofs_find_entry(dir, dentry, &page);
- if (!de)
- goto out;
-
- err = exofs_delete_entry(de, page);
- if (err)
- goto out;
-
- inode->i_ctime = dir->i_ctime;
- inode_dec_link_count(inode);
- err = 0;
-out:
- return err;
-}
-
-static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
-{
- struct inode *inode = d_inode(dentry);
- int err = -ENOTEMPTY;
-
- if (exofs_empty_dir(inode)) {
- err = exofs_unlink(dir, dentry);
- if (!err) {
- inode->i_size = 0;
- inode_dec_link_count(inode);
- inode_dec_link_count(dir);
- }
- }
- return err;
-}
-
-static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
-{
- struct inode *old_inode = d_inode(old_dentry);
- struct inode *new_inode = d_inode(new_dentry);
- struct page *dir_page = NULL;
- struct exofs_dir_entry *dir_de = NULL;
- struct page *old_page;
- struct exofs_dir_entry *old_de;
- int err = -ENOENT;
-
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
- old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
- if (!old_de)
- goto out;
-
- if (S_ISDIR(old_inode->i_mode)) {
- err = -EIO;
- dir_de = exofs_dotdot(old_inode, &dir_page);
- if (!dir_de)
- goto out_old;
- }
-
- if (new_inode) {
- struct page *new_page;
- struct exofs_dir_entry *new_de;
-
- err = -ENOTEMPTY;
- if (dir_de && !exofs_empty_dir(new_inode))
- goto out_dir;
-
- err = -ENOENT;
- new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
- if (!new_de)
- goto out_dir;
- err = exofs_set_link(new_dir, new_de, new_page, old_inode);
- new_inode->i_ctime = current_time(new_inode);
- if (dir_de)
- drop_nlink(new_inode);
- inode_dec_link_count(new_inode);
- if (err)
- goto out_dir;
- } else {
- err = exofs_add_link(new_dentry, old_inode);
- if (err)
- goto out_dir;
- if (dir_de)
- inode_inc_link_count(new_dir);
- }
-
- old_inode->i_ctime = current_time(old_inode);
-
- exofs_delete_entry(old_de, old_page);
- mark_inode_dirty(old_inode);
-
- if (dir_de) {
- err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
- inode_dec_link_count(old_dir);
- if (err)
- goto out_dir;
- }
- return 0;
-
-
-out_dir:
- if (dir_de) {
- kunmap(dir_page);
- put_page(dir_page);
- }
-out_old:
- kunmap(old_page);
- put_page(old_page);
-out:
- return err;
-}
-
-const struct inode_operations exofs_dir_inode_operations = {
- .create = exofs_create,
- .lookup = exofs_lookup,
- .link = exofs_link,
- .unlink = exofs_unlink,
- .symlink = exofs_symlink,
- .mkdir = exofs_mkdir,
- .rmdir = exofs_rmdir,
- .mknod = exofs_mknod,
- .rename = exofs_rename,
- .setattr = exofs_setattr,
-};
-
-const struct inode_operations exofs_special_inode_operations = {
- .setattr = exofs_setattr,
-};
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
deleted file mode 100644
index 24a8e34882e9..000000000000
--- a/fs/exofs/ore.c
+++ /dev/null
@@ -1,1179 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <asm/div64.h>
-#include <linux/lcm.h>
-
-#include "ore_raid.h"
-
-MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>");
-MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
-MODULE_LICENSE("GPL");
-
-/* ore_verify_layout does a couple of things:
- * 1. Given a minimum number of needed parameters fixes up the rest of the
- * members to be operatonals for the ore. The needed parameters are those
- * that are defined by the pnfs-objects layout STD.
- * 2. Check to see if the current ore code actually supports these parameters
- * for example stripe_unit must be a multple of the system PAGE_SIZE,
- * and etc...
- * 3. Cache some havily used calculations that will be needed by users.
- */
-
-enum { BIO_MAX_PAGES_KMALLOC =
- (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};
-
-int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
-{
- u64 stripe_length;
-
- switch (layout->raid_algorithm) {
- case PNFS_OSD_RAID_0:
- layout->parity = 0;
- break;
- case PNFS_OSD_RAID_5:
- layout->parity = 1;
- break;
- case PNFS_OSD_RAID_PQ:
- layout->parity = 2;
- break;
- case PNFS_OSD_RAID_4:
- default:
- ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n",
- layout->raid_algorithm);
- return -EINVAL;
- }
- if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
- ORE_ERR("Stripe Unit(0x%llx)"
- " must be Multples of PAGE_SIZE(0x%lx)\n",
- _LLU(layout->stripe_unit), PAGE_SIZE);
- return -EINVAL;
- }
- if (layout->group_width) {
- if (!layout->group_depth) {
- ORE_ERR("group_depth == 0 && group_width != 0\n");
- return -EINVAL;
- }
- if (total_comps < (layout->group_width * layout->mirrors_p1)) {
- ORE_ERR("Data Map wrong, "
- "numdevs=%d < group_width=%d * mirrors=%d\n",
- total_comps, layout->group_width,
- layout->mirrors_p1);
- return -EINVAL;
- }
- layout->group_count = total_comps / layout->mirrors_p1 /
- layout->group_width;
- } else {
- if (layout->group_depth) {
- printk(KERN_NOTICE "Warning: group_depth ignored "
- "group_width == 0 && group_depth == %lld\n",
- _LLU(layout->group_depth));
- }
- layout->group_width = total_comps / layout->mirrors_p1;
- layout->group_depth = -1;
- layout->group_count = 1;
- }
-
- stripe_length = (u64)layout->group_width * layout->stripe_unit;
- if (stripe_length >= (1ULL << 32)) {
- ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
- _LLU(stripe_length));
- return -EINVAL;
- }
-
- layout->max_io_length =
- (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
- (layout->group_width - layout->parity);
- if (layout->parity) {
- unsigned stripe_length =
- (layout->group_width - layout->parity) *
- layout->stripe_unit;
-
- layout->max_io_length /= stripe_length;
- layout->max_io_length *= stripe_length;
- }
- ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length);
-
- return 0;
-}
-EXPORT_SYMBOL(ore_verify_layout);
-
-static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
-{
- return ios->oc->comps[index & ios->oc->single_comp].cred;
-}
-
-static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
-{
- return &ios->oc->comps[index & ios->oc->single_comp].obj;
-}
-
-static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
-{
- ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
- ios->oc->first_dev, ios->oc->numdevs, index,
- ios->oc->ods);
-
- return ore_comp_dev(ios->oc, index);
-}
-
-int _ore_get_io_state(struct ore_layout *layout,
- struct ore_components *oc, unsigned numdevs,
- unsigned sgs_per_dev, unsigned num_par_pages,
- struct ore_io_state **pios)
-{
- struct ore_io_state *ios;
- size_t size_ios, size_extra, size_total;
- void *ios_extra;
-
- /*
- * The desired layout looks like this, with the extra_allocation
- * items pointed at from fields within ios or per_dev:
-
- struct __alloc_all_io_state {
- struct ore_io_state ios;
- struct ore_per_dev_state per_dev[numdevs];
- union {
- struct osd_sg_entry sglist[sgs_per_dev * numdevs];
- struct page *pages[num_par_pages];
- } extra_allocation;
- } whole_allocation;
-
- */
-
- /* This should never happen, so abort early if it ever does. */
- if (sgs_per_dev && num_par_pages) {
- ORE_DBGMSG("Tried to use both pages and sglist\n");
- *pios = NULL;
- return -EINVAL;
- }
-
- if (numdevs > (INT_MAX - sizeof(*ios)) /
- sizeof(struct ore_per_dev_state))
- return -ENOMEM;
- size_ios = sizeof(*ios) + sizeof(struct ore_per_dev_state) * numdevs;
-
- if (sgs_per_dev * numdevs > INT_MAX / sizeof(struct osd_sg_entry))
- return -ENOMEM;
- if (num_par_pages > INT_MAX / sizeof(struct page *))
- return -ENOMEM;
- size_extra = max(sizeof(struct osd_sg_entry) * (sgs_per_dev * numdevs),
- sizeof(struct page *) * num_par_pages);
-
- size_total = size_ios + size_extra;
-
- if (likely(size_total <= PAGE_SIZE)) {
- ios = kzalloc(size_total, GFP_KERNEL);
- if (unlikely(!ios)) {
- ORE_DBGMSG("Failed kzalloc bytes=%zd\n", size_total);
- *pios = NULL;
- return -ENOMEM;
- }
- ios_extra = (char *)ios + size_ios;
- } else {
- ios = kzalloc(size_ios, GFP_KERNEL);
- if (unlikely(!ios)) {
- ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
- size_ios);
- *pios = NULL;
- return -ENOMEM;
- }
- ios_extra = kzalloc(size_extra, GFP_KERNEL);
- if (unlikely(!ios_extra)) {
- ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
- size_extra);
- kfree(ios);
- *pios = NULL;
- return -ENOMEM;
- }
-
- /* In this case the per_dev[0].sgilist holds the pointer to
- * be freed
- */
- ios->extra_part_alloc = true;
- }
-
- if (num_par_pages) {
- ios->parity_pages = ios_extra;
- ios->max_par_pages = num_par_pages;
- }
- if (sgs_per_dev) {
- struct osd_sg_entry *sgilist = ios_extra;
- unsigned d;
-
- for (d = 0; d < numdevs; ++d) {
- ios->per_dev[d].sglist = sgilist;
- sgilist += sgs_per_dev;
- }
- ios->sgs_per_dev = sgs_per_dev;
- }
-
- ios->layout = layout;
- ios->oc = oc;
- *pios = ios;
- return 0;
-}
-
-/* Allocate an io_state for only a single group of devices
- *
- * If a user needs to call ore_read/write() this version must be used becase it
- * allocates extra stuff for striping and raid.
- * The ore might decide to only IO less then @length bytes do to alignmets
- * and constrains as follows:
- * - The IO cannot cross group boundary.
- * - In raid5/6 The end of the IO must align at end of a stripe eg.
- * (@offset + @length) % strip_size == 0. Or the complete range is within a
- * single stripe.
- * - Memory condition only permitted a shorter IO. (A user can use @length=~0
- * And check the returned ios->length for max_io_size.)
- *
- * The caller must check returned ios->length (and/or ios->nr_pages) and
- * re-issue these pages that fall outside of ios->length
- */
-int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
- bool is_reading, u64 offset, u64 length,
- struct ore_io_state **pios)
-{
- struct ore_io_state *ios;
- unsigned numdevs = layout->group_width * layout->mirrors_p1;
- unsigned sgs_per_dev = 0, max_par_pages = 0;
- int ret;
-
- if (layout->parity && length) {
- unsigned data_devs = layout->group_width - layout->parity;
- unsigned stripe_size = layout->stripe_unit * data_devs;
- unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
- u32 remainder;
- u64 num_stripes;
- u64 num_raid_units;
-
- num_stripes = div_u64_rem(length, stripe_size, &remainder);
- if (remainder)
- ++num_stripes;
-
- num_raid_units = num_stripes * layout->parity;
-
- if (is_reading) {
- /* For reads add per_dev sglist array */
- /* TODO: Raid 6 we need twice more. Actually:
- * num_stripes / LCMdP(W,P);
- * if (W%P != 0) num_stripes *= parity;
- */
-
- /* first/last seg is split */
- num_raid_units += layout->group_width;
- sgs_per_dev = div_u64(num_raid_units, data_devs) + 2;
- } else {
- /* For Writes add parity pages array. */
- max_par_pages = num_raid_units * pages_in_unit *
- sizeof(struct page *);
- }
- }
-
- ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
- pios);
- if (unlikely(ret))
- return ret;
-
- ios = *pios;
- ios->reading = is_reading;
- ios->offset = offset;
-
- if (length) {
- ore_calc_stripe_info(layout, offset, length, &ios->si);
- ios->length = ios->si.length;
- ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) +
- ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
- if (layout->parity)
- _ore_post_alloc_raid_stuff(ios);
- }
-
- return 0;
-}
-EXPORT_SYMBOL(ore_get_rw_state);
-
-/* Allocate an io_state for all the devices in the comps array
- *
- * This version of io_state allocation is used mostly by create/remove
- * and trunc where we currently need all the devices. The only wastful
- * bit is the read/write_attributes with no IO. Those sites should
- * be converted to use ore_get_rw_state() with length=0
- */
-int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
- struct ore_io_state **pios)
-{
- return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
-}
-EXPORT_SYMBOL(ore_get_io_state);
-
-void ore_put_io_state(struct ore_io_state *ios)
-{
- if (ios) {
- unsigned i;
-
- for (i = 0; i < ios->numdevs; i++) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[i];
-
- if (per_dev->or)
- osd_end_request(per_dev->or);
- if (per_dev->bio)
- bio_put(per_dev->bio);
- }
-
- _ore_free_raid_stuff(ios);
- kfree(ios);
- }
-}
-EXPORT_SYMBOL(ore_put_io_state);
-
-static void _sync_done(struct ore_io_state *ios, void *p)
-{
- struct completion *waiting = p;
-
- complete(waiting);
-}
-
-static void _last_io(struct kref *kref)
-{
- struct ore_io_state *ios = container_of(
- kref, struct ore_io_state, kref);
-
- ios->done(ios, ios->private);
-}
-
-static void _done_io(struct osd_request *or, void *p)
-{
- struct ore_io_state *ios = p;
-
- kref_put(&ios->kref, _last_io);
-}
-
-int ore_io_execute(struct ore_io_state *ios)
-{
- DECLARE_COMPLETION_ONSTACK(wait);
- bool sync = (ios->done == NULL);
- int i, ret;
-
- if (sync) {
- ios->done = _sync_done;
- ios->private = &wait;
- }
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_request *or = ios->per_dev[i].or;
- if (unlikely(!or))
- continue;
-
- ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL);
- if (unlikely(ret)) {
- ORE_DBGMSG("Failed to osd_finalize_request() => %d\n",
- ret);
- return ret;
- }
- }
-
- kref_init(&ios->kref);
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_request *or = ios->per_dev[i].or;
- if (unlikely(!or))
- continue;
-
- kref_get(&ios->kref);
- osd_execute_request_async(or, _done_io, ios);
- }
-
- kref_put(&ios->kref, _last_io);
- ret = 0;
-
- if (sync) {
- wait_for_completion(&wait);
- ret = ore_check_io(ios, NULL);
- }
- return ret;
-}
-
-static void _clear_bio(struct bio *bio)
-{
- struct bio_vec *bv;
- unsigned i;
- struct bvec_iter_all iter_all;
-
- bio_for_each_segment_all(bv, bio, i, iter_all) {
- unsigned this_count = bv->bv_len;
-
- if (likely(PAGE_SIZE == this_count))
- clear_highpage(bv->bv_page);
- else
- zero_user(bv->bv_page, bv->bv_offset, this_count);
- }
-}
-
-int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
-{
- enum osd_err_priority acumulated_osd_err = 0;
- int acumulated_lin_err = 0;
- int i;
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_sense_info osi;
- struct ore_per_dev_state *per_dev = &ios->per_dev[i];
- struct osd_request *or = per_dev->or;
- int ret;
-
- if (unlikely(!or))
- continue;
-
- ret = osd_req_decode_sense(or, &osi);
- if (likely(!ret))
- continue;
-
- if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) &&
- per_dev->bio) {
- /* start read offset passed endof file.
- * Note: if we do not have bio it means read-attributes
- * In this case we should return error to caller.
- */
- _clear_bio(per_dev->bio);
- ORE_DBGMSG("start read offset passed end of file "
- "offset=0x%llx, length=0x%llx\n",
- _LLU(per_dev->offset),
- _LLU(per_dev->length));
-
- continue; /* we recovered */
- }
-
- if (on_dev_error) {
- u64 residual = ios->reading ?
- or->in.residual : or->out.residual;
- u64 offset = (ios->offset + ios->length) - residual;
- unsigned dev = per_dev->dev - ios->oc->first_dev;
- struct ore_dev *od = ios->oc->ods[dev];
-
- on_dev_error(ios, od, dev, osi.osd_err_pri,
- offset, residual);
- }
- if (osi.osd_err_pri >= acumulated_osd_err) {
- acumulated_osd_err = osi.osd_err_pri;
- acumulated_lin_err = ret;
- }
- }
-
- return acumulated_lin_err;
-}
-EXPORT_SYMBOL(ore_check_io);
-
-/*
- * L - logical offset into the file
- *
- * D - number of Data devices
- * D = group_width - parity
- *
- * U - The number of bytes in a stripe within a group
- * U = stripe_unit * D
- *
- * T - The number of bytes striped within a group of component objects
- * (before advancing to the next group)
- * T = U * group_depth
- *
- * S - The number of bytes striped across all component objects
- * before the pattern repeats
- * S = T * group_count
- *
- * M - The "major" (i.e., across all components) cycle number
- * M = L / S
- *
- * G - Counts the groups from the beginning of the major cycle
- * G = (L - (M * S)) / T [or (L % S) / T]
- *
- * H - The byte offset within the group
- * H = (L - (M * S)) % T [or (L % S) % T]
- *
- * N - The "minor" (i.e., across the group) stripe number
- * N = H / U
- *
- * C - The component index coresponding to L
- *
- * C = (H - (N * U)) / stripe_unit + G * D
- * [or (L % U) / stripe_unit + G * D]
- *
- * O - The component offset coresponding to L
- * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
- *
- * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
- * divide by parity
- * LCMdP = lcm(group_width, parity) / parity
- *
- * R - The parity Rotation stripe
- * (Note parity cycle always starts at a group's boundary)
- * R = N % LCMdP
- *
- * I = the first parity device index
- * I = (group_width + group_width - R*parity - parity) % group_width
- *
- * Craid - The component index Rotated
- * Craid = (group_width + C - R*parity) % group_width
- * (We add the group_width to avoid negative numbers modulo math)
- */
-void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
- u64 length, struct ore_striping_info *si)
-{
- u32 stripe_unit = layout->stripe_unit;
- u32 group_width = layout->group_width;
- u64 group_depth = layout->group_depth;
- u32 parity = layout->parity;
-
- u32 D = group_width - parity;
- u32 U = D * stripe_unit;
- u64 T = U * group_depth;
- u64 S = T * layout->group_count;
- u64 M = div64_u64(file_offset, S);
-
- /*
- G = (L - (M * S)) / T
- H = (L - (M * S)) % T
- */
- u64 LmodS = file_offset - M * S;
- u32 G = div64_u64(LmodS, T);
- u64 H = LmodS - G * T;
-
- u32 N = div_u64(H, U);
- u32 Nlast;
-
- /* "H - (N * U)" is just "H % U" so it's bound to u32 */
- u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
- u32 first_dev = C - C % group_width;
-
- div_u64_rem(file_offset, stripe_unit, &si->unit_off);
-
- si->obj_offset = si->unit_off + (N * stripe_unit) +
- (M * group_depth * stripe_unit);
- si->cur_comp = C - first_dev;
- si->cur_pg = si->unit_off / PAGE_SIZE;
-
- if (parity) {
- u32 LCMdP = lcm(group_width, parity) / parity;
- /* R = N % LCMdP; */
- u32 RxP = (N % LCMdP) * parity;
-
- si->par_dev = (group_width + group_width - parity - RxP) %
- group_width + first_dev;
- si->dev = (group_width + group_width + C - RxP) %
- group_width + first_dev;
- si->bytes_in_stripe = U;
- si->first_stripe_start = M * S + G * T + N * U;
- } else {
- /* Make the math correct see _prepare_one_group */
- si->par_dev = group_width;
- si->dev = C;
- }
-
- si->dev *= layout->mirrors_p1;
- si->par_dev *= layout->mirrors_p1;
- si->offset = file_offset;
- si->length = T - H;
- if (si->length > length)
- si->length = length;
-
- Nlast = div_u64(H + si->length + U - 1, U);
- si->maxdevUnits = Nlast - N;
-
- si->M = M;
-}
-EXPORT_SYMBOL(ore_calc_stripe_info);
-
-int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
- unsigned pgbase, struct page **pages,
- struct ore_per_dev_state *per_dev, int cur_len)
-{
- unsigned pg = *cur_pg;
- struct request_queue *q =
- osd_request_queue(_ios_od(ios, per_dev->dev));
- unsigned len = cur_len;
- int ret;
-
- if (per_dev->bio == NULL) {
- unsigned bio_size;
-
- if (!ios->reading) {
- bio_size = ios->si.maxdevUnits;
- } else {
- bio_size = (ios->si.maxdevUnits + 1) *
- (ios->layout->group_width - ios->layout->parity) /
- ios->layout->group_width;
- }
- bio_size *= (ios->layout->stripe_unit / PAGE_SIZE);
-
- per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
- if (unlikely(!per_dev->bio)) {
- ORE_DBGMSG("Failed to allocate BIO size=%u\n",
- bio_size);
- ret = -ENOMEM;
- goto out;
- }
- }
-
- while (cur_len > 0) {
- unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
- unsigned added_len;
-
- cur_len -= pglen;
-
- added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
- pglen, pgbase);
- if (unlikely(pglen != added_len)) {
- /* If bi_vcnt == bi_max then this is a SW BUG */
- ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x "
- "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n",
- per_dev->bio->bi_vcnt,
- per_dev->bio->bi_max_vecs,
- BIO_MAX_PAGES_KMALLOC, cur_len);
- ret = -ENOMEM;
- goto out;
- }
- _add_stripe_page(ios->sp2d, &ios->si, pages[pg]);
-
- pgbase = 0;
- ++pg;
- }
- BUG_ON(cur_len);
-
- per_dev->length += len;
- *cur_pg = pg;
- ret = 0;
-out: /* we fail the complete unit on an error eg don't advance
- * per_dev->length and cur_pg. This means that we might have a bigger
- * bio than the CDB requested length (per_dev->length). That's fine
- * only the oposite is fatal.
- */
- return ret;
-}
-
-static int _add_parity_units(struct ore_io_state *ios,
- struct ore_striping_info *si,
- unsigned dev, unsigned first_dev,
- unsigned mirrors_p1, unsigned devs_in_group,
- unsigned cur_len)
-{
- unsigned do_parity;
- int ret = 0;
-
- for (do_parity = ios->layout->parity; do_parity; --do_parity) {
- struct ore_per_dev_state *per_dev;
-
- per_dev = &ios->per_dev[dev - first_dev];
- if (!per_dev->length && !per_dev->offset) {
- /* Only/always the parity unit of the first
- * stripe will be empty. So this is a chance to
- * initialize the per_dev info.
- */
- per_dev->dev = dev;
- per_dev->offset = si->obj_offset - si->unit_off;
- }
-
- ret = _ore_add_parity_unit(ios, si, per_dev, cur_len,
- do_parity == 1);
- if (unlikely(ret))
- break;
-
- if (do_parity != 1) {
- dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
- si->cur_comp = (si->cur_comp + 1) %
- ios->layout->group_width;
- }
- }
-
- return ret;
-}
-
-static int _prepare_for_striping(struct ore_io_state *ios)
-{
- struct ore_striping_info *si = &ios->si;
- unsigned stripe_unit = ios->layout->stripe_unit;
- unsigned mirrors_p1 = ios->layout->mirrors_p1;
- unsigned group_width = ios->layout->group_width;
- unsigned devs_in_group = group_width * mirrors_p1;
- unsigned dev = si->dev;
- unsigned first_dev = dev - (dev % devs_in_group);
- unsigned cur_pg = ios->pages_consumed;
- u64 length = ios->length;
- int ret = 0;
-
- if (!ios->pages) {
- ios->numdevs = ios->layout->mirrors_p1;
- return 0;
- }
-
- BUG_ON(length > si->length);
-
- while (length) {
- struct ore_per_dev_state *per_dev =
- &ios->per_dev[dev - first_dev];
- unsigned cur_len, page_off = 0;
-
- if (!per_dev->length && !per_dev->offset) {
- /* First time initialize the per_dev info. */
- per_dev->dev = dev;
- if (dev == si->dev) {
- WARN_ON(dev == si->par_dev);
- per_dev->offset = si->obj_offset;
- cur_len = stripe_unit - si->unit_off;
- page_off = si->unit_off & ~PAGE_MASK;
- BUG_ON(page_off && (page_off != ios->pgbase));
- } else {
- per_dev->offset = si->obj_offset - si->unit_off;
- cur_len = stripe_unit;
- }
- } else {
- cur_len = stripe_unit;
- }
- if (cur_len >= length)
- cur_len = length;
-
- ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
- per_dev, cur_len);
- if (unlikely(ret))
- goto out;
-
- length -= cur_len;
-
- dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
- si->cur_comp = (si->cur_comp + 1) % group_width;
- if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
- if (!length && ios->sp2d) {
- /* If we are writing and this is the very last
- * stripe. then operate on parity dev.
- */
- dev = si->par_dev;
- /* If last stripe operate on parity comp */
- si->cur_comp = group_width - ios->layout->parity;
- }
-
- /* In writes cur_len just means if it's the
- * last one. See _ore_add_parity_unit.
- */
- ret = _add_parity_units(ios, si, dev, first_dev,
- mirrors_p1, devs_in_group,
- ios->sp2d ? length : cur_len);
- if (unlikely(ret))
- goto out;
-
- /* Rotate next par_dev backwards with wraping */
- si->par_dev = (devs_in_group + si->par_dev -
- ios->layout->parity * mirrors_p1) %
- devs_in_group + first_dev;
- /* Next stripe, start fresh */
- si->cur_comp = 0;
- si->cur_pg = 0;
- si->obj_offset += cur_len;
- si->unit_off = 0;
- }
- }
-out:
- ios->numdevs = devs_in_group;
- ios->pages_consumed = cur_pg;
- return ret;
-}
-
-int ore_create(struct ore_io_state *ios)
-{
- int i, ret;
-
- for (i = 0; i < ios->oc->numdevs; i++) {
- struct osd_request *or;
-
- or = osd_start_request(_ios_od(ios, i));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- ret = -ENOMEM;
- goto out;
- }
- ios->per_dev[i].or = or;
- ios->numdevs++;
-
- osd_req_create_object(or, _ios_obj(ios, i));
- }
- ret = ore_io_execute(ios);
-
-out:
- return ret;
-}
-EXPORT_SYMBOL(ore_create);
-
-int ore_remove(struct ore_io_state *ios)
-{
- int i, ret;
-
- for (i = 0; i < ios->oc->numdevs; i++) {
- struct osd_request *or;
-
- or = osd_start_request(_ios_od(ios, i));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- ret = -ENOMEM;
- goto out;
- }
- ios->per_dev[i].or = or;
- ios->numdevs++;
-
- osd_req_remove_object(or, _ios_obj(ios, i));
- }
- ret = ore_io_execute(ios);
-
-out:
- return ret;
-}
-EXPORT_SYMBOL(ore_remove);
-
-static int _write_mirror(struct ore_io_state *ios, int cur_comp)
-{
- struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp];
- unsigned dev = ios->per_dev[cur_comp].dev;
- unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
- int ret = 0;
-
- if (ios->pages && !master_dev->length)
- return 0; /* Just an empty slot */
-
- for (; cur_comp < last_comp; ++cur_comp, ++dev) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
- struct osd_request *or;
-
- or = osd_start_request(_ios_od(ios, dev));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- ret = -ENOMEM;
- goto out;
- }
- per_dev->or = or;
-
- if (ios->pages) {
- struct bio *bio;
-
- if (per_dev != master_dev) {
- bio = bio_clone_fast(master_dev->bio,
- GFP_KERNEL, NULL);
- if (unlikely(!bio)) {
- ORE_DBGMSG(
- "Failed to allocate BIO size=%u\n",
- master_dev->bio->bi_max_vecs);
- ret = -ENOMEM;
- goto out;
- }
-
- bio->bi_disk = NULL;
- bio->bi_next = NULL;
- per_dev->offset = master_dev->offset;
- per_dev->length = master_dev->length;
- per_dev->bio = bio;
- per_dev->dev = dev;
- } else {
- bio = master_dev->bio;
- /* FIXME: bio_set_dir() */
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- }
-
- osd_req_write(or, _ios_obj(ios, cur_comp),
- per_dev->offset, bio, per_dev->length);
- ORE_DBGMSG("write(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%d\n",
- _LLU(_ios_obj(ios, cur_comp)->id),
- _LLU(per_dev->offset),
- _LLU(per_dev->length), dev);
- } else if (ios->kern_buff) {
- per_dev->offset = ios->si.obj_offset;
- per_dev->dev = ios->si.dev + dev;
-
- /* no cross device without page array */
- BUG_ON((ios->layout->group_width > 1) &&
- (ios->si.unit_off + ios->length >
- ios->layout->stripe_unit));
-
- ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp),
- per_dev->offset,
- ios->kern_buff, ios->length);
- if (unlikely(ret))
- goto out;
- ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%d\n",
- _LLU(_ios_obj(ios, cur_comp)->id),
- _LLU(per_dev->offset),
- _LLU(ios->length), per_dev->dev);
- } else {
- osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
- ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
- _LLU(_ios_obj(ios, cur_comp)->id),
- ios->out_attr_len, dev);
- }
-
- if (ios->out_attr)
- osd_req_add_set_attr_list(or, ios->out_attr,
- ios->out_attr_len);
-
- if (ios->in_attr)
- osd_req_add_get_attr_list(or, ios->in_attr,
- ios->in_attr_len);
- }
-
-out:
- return ret;
-}
-
-int ore_write(struct ore_io_state *ios)
-{
- int i;
- int ret;
-
- if (unlikely(ios->sp2d && !ios->r4w)) {
- /* A library is attempting a RAID-write without providing
- * a pages lock interface.
- */
- WARN_ON_ONCE(1);
- return -ENOTSUPP;
- }
-
- ret = _prepare_for_striping(ios);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- ret = _write_mirror(ios, i);
- if (unlikely(ret))
- return ret;
- }
-
- ret = ore_io_execute(ios);
- return ret;
-}
-EXPORT_SYMBOL(ore_write);
-
-int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
-{
- struct osd_request *or;
- struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
- struct osd_obj_id *obj = _ios_obj(ios, cur_comp);
- unsigned first_dev = (unsigned)obj->id;
-
- if (ios->pages && !per_dev->length)
- return 0; /* Just an empty slot */
-
- first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
- or = osd_start_request(_ios_od(ios, first_dev));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- return -ENOMEM;
- }
- per_dev->or = or;
-
- if (ios->pages) {
- if (per_dev->cur_sg) {
- /* finalize the last sg_entry */
- _ore_add_sg_seg(per_dev, 0, false);
- if (unlikely(!per_dev->cur_sg))
- return 0; /* Skip parity only device */
-
- osd_req_read_sg(or, obj, per_dev->bio,
- per_dev->sglist, per_dev->cur_sg);
- } else {
- /* The no raid case */
- osd_req_read(or, obj, per_dev->offset,
- per_dev->bio, per_dev->length);
- }
-
- ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
- " dev=%d sg_len=%d\n", _LLU(obj->id),
- _LLU(per_dev->offset), _LLU(per_dev->length),
- first_dev, per_dev->cur_sg);
- } else {
- BUG_ON(ios->kern_buff);
-
- osd_req_get_attributes(or, obj);
- ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
- _LLU(obj->id),
- ios->in_attr_len, first_dev);
- }
- if (ios->out_attr)
- osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
-
- if (ios->in_attr)
- osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
-
- return 0;
-}
-
-int ore_read(struct ore_io_state *ios)
-{
- int i;
- int ret;
-
- ret = _prepare_for_striping(ios);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- ret = _ore_read_mirror(ios, i);
- if (unlikely(ret))
- return ret;
- }
-
- ret = ore_io_execute(ios);
- return ret;
-}
-EXPORT_SYMBOL(ore_read);
-
-int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr)
-{
- struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
- void *iter = NULL;
- int nelem;
-
- do {
- nelem = 1;
- osd_req_decode_get_attr_list(ios->per_dev[0].or,
- &cur_attr, &nelem, &iter);
- if ((cur_attr.attr_page == attr->attr_page) &&
- (cur_attr.attr_id == attr->attr_id)) {
- attr->len = cur_attr.len;
- attr->val_ptr = cur_attr.val_ptr;
- return 0;
- }
- } while (iter);
-
- return -EIO;
-}
-EXPORT_SYMBOL(extract_attr_from_ios);
-
-static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
- struct osd_attr *attr)
-{
- int last_comp = cur_comp + ios->layout->mirrors_p1;
-
- for (; cur_comp < last_comp; ++cur_comp) {
- struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
- struct osd_request *or;
-
- or = osd_start_request(_ios_od(ios, cur_comp));
- if (unlikely(!or)) {
- ORE_ERR("%s: osd_start_request failed\n", __func__);
- return -ENOMEM;
- }
- per_dev->or = or;
-
- osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
- osd_req_add_set_attr_list(or, attr, 1);
- }
-
- return 0;
-}
-
-struct _trunc_info {
- struct ore_striping_info si;
- u64 prev_group_obj_off;
- u64 next_group_obj_off;
-
- unsigned first_group_dev;
- unsigned nex_group_dev;
-};
-
-static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
- struct _trunc_info *ti)
-{
- unsigned stripe_unit = layout->stripe_unit;
-
- ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
-
- ti->prev_group_obj_off = ti->si.M * stripe_unit;
- ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
-
- ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
- ti->nex_group_dev = ti->first_group_dev + layout->group_width;
-}
-
-int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
- u64 size)
-{
- struct ore_io_state *ios;
- struct exofs_trunc_attr {
- struct osd_attr attr;
- __be64 newsize;
- } *size_attrs;
- struct _trunc_info ti;
- int i, ret;
-
- ret = ore_get_io_state(layout, oc, &ios);
- if (unlikely(ret))
- return ret;
-
- _calc_trunk_info(ios->layout, size, &ti);
-
- size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
- GFP_KERNEL);
- if (unlikely(!size_attrs)) {
- ret = -ENOMEM;
- goto out;
- }
-
- ios->numdevs = ios->oc->numdevs;
-
- for (i = 0; i < ios->numdevs; ++i) {
- struct exofs_trunc_attr *size_attr = &size_attrs[i];
- u64 obj_size;
-
- if (i < ti.first_group_dev)
- obj_size = ti.prev_group_obj_off;
- else if (i >= ti.nex_group_dev)
- obj_size = ti.next_group_obj_off;
- else if (i < ti.si.dev) /* dev within this group */
- obj_size = ti.si.obj_offset +
- ios->layout->stripe_unit - ti.si.unit_off;
- else if (i == ti.si.dev)
- obj_size = ti.si.obj_offset;
- else /* i > ti.dev */
- obj_size = ti.si.obj_offset - ti.si.unit_off;
-
- size_attr->newsize = cpu_to_be64(obj_size);
- size_attr->attr = g_attr_logical_length;
- size_attr->attr.val_ptr = &size_attr->newsize;
-
- ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
- _LLU(oc->comps->obj.id), _LLU(obj_size), i);
- ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
- &size_attr->attr);
- if (unlikely(ret))
- goto out;
- }
- ret = ore_io_execute(ios);
-
-out:
- kfree(size_attrs);
- ore_put_io_state(ios);
- return ret;
-}
-EXPORT_SYMBOL(ore_truncate);
-
-const struct osd_attr g_attr_logical_length = ATTR_DEF(
- OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
-EXPORT_SYMBOL(g_attr_logical_length);
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
deleted file mode 100644
index e83bab54b03e..000000000000
--- a/fs/exofs/ore_raid.c
+++ /dev/null
@@ -1,757 +0,0 @@
-/*
- * Copyright (C) 2011
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of the objects raid engine (ore).
- *
- * It is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * You should have received a copy of the GNU General Public License
- * along with "ore". If not, write to the Free Software Foundation, Inc:
- * "Free Software Foundation <info@fsf.org>"
- */
-
-#include <linux/gfp.h>
-#include <linux/async_tx.h>
-
-#include "ore_raid.h"
-
-#undef ORE_DBGMSG2
-#define ORE_DBGMSG2 ORE_DBGMSG
-
-static struct page *_raid_page_alloc(void)
-{
- return alloc_page(GFP_KERNEL);
-}
-
-static void _raid_page_free(struct page *p)
-{
- __free_page(p);
-}
-
-/* This struct is forward declare in ore_io_state, but is private to here.
- * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
- *
- * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
- * Ascending page index access is sp2d(p-minor, c-major). But storage is
- * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor
- * API.
- */
-struct __stripe_pages_2d {
- /* Cache some hot path repeated calculations */
- unsigned parity;
- unsigned data_devs;
- unsigned pages_in_unit;
-
- bool needed ;
-
- /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
- struct __1_page_stripe {
- bool alloc;
- unsigned write_count;
- struct async_submit_ctl submit;
- struct dma_async_tx_descriptor *tx;
-
- /* The size of this array is data_devs + parity */
- struct page **pages;
- struct page **scribble;
- /* bool array, size of this array is data_devs */
- char *page_is_read;
- } _1p_stripes[];
-};
-
-/* This can get bigger then a page. So support multiple page allocations
- * _sp2d_free should be called even if _sp2d_alloc fails (by returning
- * none-zero).
- */
-static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
- unsigned parity, struct __stripe_pages_2d **psp2d)
-{
- struct __stripe_pages_2d *sp2d;
- unsigned data_devs = group_width - parity;
-
- /*
- * Desired allocation layout is, though when larger than PAGE_SIZE,
- * each struct __alloc_1p_arrays is separately allocated:
-
- struct _alloc_all_bytes {
- struct __alloc_stripe_pages_2d {
- struct __stripe_pages_2d sp2d;
- struct __1_page_stripe _1p_stripes[pages_in_unit];
- } __asp2d;
- struct __alloc_1p_arrays {
- struct page *pages[group_width];
- struct page *scribble[group_width];
- char page_is_read[data_devs];
- } __a1pa[pages_in_unit];
- } *_aab;
-
- struct __alloc_1p_arrays *__a1pa;
- struct __alloc_1p_arrays *__a1pa_end;
-
- */
-
- char *__a1pa;
- char *__a1pa_end;
-
- const size_t sizeof_stripe_pages_2d =
- sizeof(struct __stripe_pages_2d) +
- sizeof(struct __1_page_stripe) * pages_in_unit;
- const size_t sizeof__a1pa =
- ALIGN(sizeof(struct page *) * (2 * group_width) + data_devs,
- sizeof(void *));
- const size_t sizeof__a1pa_arrays = sizeof__a1pa * pages_in_unit;
- const size_t alloc_total = sizeof_stripe_pages_2d +
- sizeof__a1pa_arrays;
-
- unsigned num_a1pa, alloc_size, i;
-
- /* FIXME: check these numbers in ore_verify_layout */
- BUG_ON(sizeof_stripe_pages_2d > PAGE_SIZE);
- BUG_ON(sizeof__a1pa > PAGE_SIZE);
-
- /*
- * If alloc_total would be larger than PAGE_SIZE, only allocate
- * as many a1pa items as would fill the rest of the page, instead
- * of the full pages_in_unit count.
- */
- if (alloc_total > PAGE_SIZE) {
- num_a1pa = (PAGE_SIZE - sizeof_stripe_pages_2d) / sizeof__a1pa;
- alloc_size = sizeof_stripe_pages_2d + sizeof__a1pa * num_a1pa;
- } else {
- num_a1pa = pages_in_unit;
- alloc_size = alloc_total;
- }
-
- *psp2d = sp2d = kzalloc(alloc_size, GFP_KERNEL);
- if (unlikely(!sp2d)) {
- ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size);
- return -ENOMEM;
- }
- /* From here Just call _sp2d_free */
-
- /* Find start of a1pa area. */
- __a1pa = (char *)sp2d + sizeof_stripe_pages_2d;
- /* Find end of the _allocated_ a1pa area. */
- __a1pa_end = __a1pa + alloc_size;
-
- /* Allocate additionally needed a1pa items in PAGE_SIZE chunks. */
- for (i = 0; i < pages_in_unit; ++i) {
- struct __1_page_stripe *stripe = &sp2d->_1p_stripes[i];
-
- if (unlikely(__a1pa >= __a1pa_end)) {
- num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
- pages_in_unit - i);
- alloc_size = sizeof__a1pa * num_a1pa;
-
- __a1pa = kzalloc(alloc_size, GFP_KERNEL);
- if (unlikely(!__a1pa)) {
- ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
- num_a1pa);
- return -ENOMEM;
- }
- __a1pa_end = __a1pa + alloc_size;
- /* First *pages is marked for kfree of the buffer */
- stripe->alloc = true;
- }
-
- /*
- * Attach all _lp_stripes pointers to the allocation for
- * it which was either part of the original PAGE_SIZE
- * allocation or the subsequent allocation in this loop.
- */
- stripe->pages = (void *)__a1pa;
- stripe->scribble = stripe->pages + group_width;
- stripe->page_is_read = (char *)stripe->scribble + group_width;
- __a1pa += sizeof__a1pa;
- }
-
- sp2d->parity = parity;
- sp2d->data_devs = data_devs;
- sp2d->pages_in_unit = pages_in_unit;
- return 0;
-}
-
-static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
- const struct _ore_r4w_op *r4w, void *priv)
-{
- unsigned data_devs = sp2d->data_devs;
- unsigned group_width = data_devs + sp2d->parity;
- int p, c;
-
- if (!sp2d->needed)
- return;
-
- for (c = data_devs - 1; c >= 0; --c)
- for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if (_1ps->page_is_read[c]) {
- struct page *page = _1ps->pages[c];
-
- r4w->put_page(priv, page);
- _1ps->page_is_read[c] = false;
- }
- }
-
- for (p = 0; p < sp2d->pages_in_unit; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
- _1ps->write_count = 0;
- _1ps->tx = NULL;
- }
-
- sp2d->needed = false;
-}
-
-static void _sp2d_free(struct __stripe_pages_2d *sp2d)
-{
- unsigned i;
-
- if (!sp2d)
- return;
-
- for (i = 0; i < sp2d->pages_in_unit; ++i) {
- if (sp2d->_1p_stripes[i].alloc)
- kfree(sp2d->_1p_stripes[i].pages);
- }
-
- kfree(sp2d);
-}
-
-static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
-{
- unsigned p;
-
- for (p = 0; p < sp2d->pages_in_unit; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if (_1ps->write_count)
- return p;
- }
-
- return ~0;
-}
-
-static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
-{
- int p;
-
- for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if (_1ps->write_count)
- return p;
- }
-
- return ~0;
-}
-
-static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
-{
- unsigned p;
- unsigned tx_flags = ASYNC_TX_ACK;
-
- if (sp2d->parity == 1)
- tx_flags |= ASYNC_TX_XOR_ZERO_DST;
-
- for (p = 0; p < sp2d->pages_in_unit; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if (!_1ps->write_count)
- continue;
-
- init_async_submit(&_1ps->submit, tx_flags,
- NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble);
-
- if (sp2d->parity == 1)
- _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs],
- _1ps->pages, 0, sp2d->data_devs,
- PAGE_SIZE, &_1ps->submit);
- else /* parity == 2 */
- _1ps->tx = async_gen_syndrome(_1ps->pages, 0,
- sp2d->data_devs + sp2d->parity,
- PAGE_SIZE, &_1ps->submit);
- }
-
- for (p = 0; p < sp2d->pages_in_unit; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
- /* NOTE: We wait for HW synchronously (I don't have such HW
- * to test with.) Is parallelism needed with today's multi
- * cores?
- */
- async_tx_issue_pending(_1ps->tx);
- }
-}
-
-void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
- struct ore_striping_info *si, struct page *page)
-{
- struct __1_page_stripe *_1ps;
-
- sp2d->needed = true;
-
- _1ps = &sp2d->_1p_stripes[si->cur_pg];
- _1ps->pages[si->cur_comp] = page;
- ++_1ps->write_count;
-
- si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit;
- /* si->cur_comp is advanced outside at main loop */
-}
-
-void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
- bool not_last)
-{
- struct osd_sg_entry *sge;
-
- ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
- "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
- per_dev->dev, cur_len, not_last, per_dev->cur_sg,
- _LLU(per_dev->offset), per_dev->length,
- per_dev->last_sgs_total);
-
- if (!per_dev->cur_sg) {
- sge = per_dev->sglist;
-
- /* First time we prepare two entries */
- if (per_dev->length) {
- ++per_dev->cur_sg;
- sge->offset = per_dev->offset;
- sge->len = per_dev->length;
- } else {
- /* Here the parity is the first unit of this object.
- * This happens every time we reach a parity device on
- * the same stripe as the per_dev->offset. We need to
- * just skip this unit.
- */
- per_dev->offset += cur_len;
- return;
- }
- } else {
- /* finalize the last one */
- sge = &per_dev->sglist[per_dev->cur_sg - 1];
- sge->len = per_dev->length - per_dev->last_sgs_total;
- }
-
- if (not_last) {
- /* Partly prepare the next one */
- struct osd_sg_entry *next_sge = sge + 1;
-
- ++per_dev->cur_sg;
- next_sge->offset = sge->offset + sge->len + cur_len;
- /* Save cur len so we know how mutch was added next time */
- per_dev->last_sgs_total = per_dev->length;
- next_sge->len = 0;
- } else if (!sge->len) {
- /* Optimize for when the last unit is a parity */
- --per_dev->cur_sg;
- }
-}
-
-static int _alloc_read_4_write(struct ore_io_state *ios)
-{
- struct ore_layout *layout = ios->layout;
- int ret;
- /* We want to only read those pages not in cache so worst case
- * is a stripe populated with every other page
- */
- unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2;
-
- ret = _ore_get_io_state(layout, ios->oc,
- layout->group_width * layout->mirrors_p1,
- sgs_per_dev, 0, &ios->ios_read_4_write);
- return ret;
-}
-
-/* @si contains info of the to-be-inserted page. Update of @si should be
- * maintained by caller. Specificaly si->dev, si->obj_offset, ...
- */
-static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si,
- struct page *page, unsigned pg_len)
-{
- struct request_queue *q;
- struct ore_per_dev_state *per_dev;
- struct ore_io_state *read_ios;
- unsigned first_dev = si->dev - (si->dev %
- (ios->layout->group_width * ios->layout->mirrors_p1));
- unsigned comp = si->dev - first_dev;
- unsigned added_len;
-
- if (!ios->ios_read_4_write) {
- int ret = _alloc_read_4_write(ios);
-
- if (unlikely(ret))
- return ret;
- }
-
- read_ios = ios->ios_read_4_write;
- read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1;
-
- per_dev = &read_ios->per_dev[comp];
- if (!per_dev->length) {
- per_dev->bio = bio_kmalloc(GFP_KERNEL,
- ios->sp2d->pages_in_unit);
- if (unlikely(!per_dev->bio)) {
- ORE_DBGMSG("Failed to allocate BIO size=%u\n",
- ios->sp2d->pages_in_unit);
- return -ENOMEM;
- }
- per_dev->offset = si->obj_offset;
- per_dev->dev = si->dev;
- } else if (si->obj_offset != (per_dev->offset + per_dev->length)) {
- u64 gap = si->obj_offset - (per_dev->offset + per_dev->length);
-
- _ore_add_sg_seg(per_dev, gap, true);
- }
- q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
- added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len,
- si->obj_offset % PAGE_SIZE);
- if (unlikely(added_len != pg_len)) {
- ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
- per_dev->bio->bi_vcnt);
- return -ENOMEM;
- }
-
- per_dev->length += pg_len;
- return 0;
-}
-
-/* read the beginning of an unaligned first page */
-static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page)
-{
- struct ore_striping_info si;
- unsigned pg_len;
-
- ore_calc_stripe_info(ios->layout, ios->offset, 0, &si);
-
- pg_len = si.obj_offset % PAGE_SIZE;
- si.obj_offset -= pg_len;
-
- ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n",
- _LLU(si.obj_offset), pg_len, page->index, si.dev);
-
- return _add_to_r4w(ios, &si, page, pg_len);
-}
-
-/* read the end of an incomplete last page */
-static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset)
-{
- struct ore_striping_info si;
- struct page *page;
- unsigned pg_len, p, c;
-
- ore_calc_stripe_info(ios->layout, *offset, 0, &si);
-
- p = si.cur_pg;
- c = si.cur_comp;
- page = ios->sp2d->_1p_stripes[p].pages[c];
-
- pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE);
- *offset += pg_len;
-
- ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n",
- p, c, _LLU(*offset), pg_len, si.dev, si.par_dev);
-
- BUG_ON(!page);
-
- return _add_to_r4w(ios, &si, page, pg_len);
-}
-
-static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
-{
- struct bio_vec *bv;
- unsigned i, d;
-
- /* loop on all devices all pages */
- for (d = 0; d < ios->numdevs; d++) {
- struct bio *bio = ios->per_dev[d].bio;
- struct bvec_iter_all iter_all;
-
- if (!bio)
- continue;
-
- bio_for_each_segment_all(bv, bio, i, iter_all) {
- struct page *page = bv->bv_page;
-
- SetPageUptodate(page);
- if (PageError(page))
- ClearPageError(page);
- }
- }
-}
-
-/* read_4_write is hacked to read the start of the first stripe and/or
- * the end of the last stripe. If needed, with an sg-gap at each device/page.
- * It is assumed to be called after the to_be_written pages of the first stripe
- * are populating ios->sp2d[][]
- *
- * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations
- * These pages are held at sp2d[p].pages[c] but with
- * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are
- * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is
- * @uptodate=true, so we don't need to read it, only unlock, after IO.
- *
- * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then
- * to-be-written count, we should consider the xor-in-place mode.
- * need_to_read_pages_count is the actual number of pages not present in cache.
- * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
- * approximation? In this mode the read pages are put in the empty places of
- * ios->sp2d[p][*], xor is calculated the same way. These pages are
- * allocated/freed and don't go through cache
- */
-static int _read_4_write_first_stripe(struct ore_io_state *ios)
-{
- struct ore_striping_info read_si;
- struct __stripe_pages_2d *sp2d = ios->sp2d;
- u64 offset = ios->si.first_stripe_start;
- unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
-
- if (offset == ios->offset) /* Go to start collect $200 */
- goto read_last_stripe;
-
- min_p = _sp2d_min_pg(sp2d);
- max_p = _sp2d_max_pg(sp2d);
-
- ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n",
- offset, ios->offset, min_p, max_p);
-
- for (c = 0; ; c++) {
- ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
- read_si.obj_offset += min_p * PAGE_SIZE;
- offset += min_p * PAGE_SIZE;
- for (p = min_p; p <= max_p; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
- struct page **pp = &_1ps->pages[c];
- bool uptodate;
-
- if (*pp) {
- if (ios->offset % PAGE_SIZE)
- /* Read the remainder of the page */
- _add_to_r4w_first_page(ios, *pp);
- /* to-be-written pages start here */
- goto read_last_stripe;
- }
-
- *pp = ios->r4w->get_page(ios->private, offset,
- &uptodate);
- if (unlikely(!*pp))
- return -ENOMEM;
-
- if (!uptodate)
- _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE);
-
- /* Mark read-pages to be cache_released */
- _1ps->page_is_read[c] = true;
- read_si.obj_offset += PAGE_SIZE;
- offset += PAGE_SIZE;
- }
- offset += (sp2d->pages_in_unit - p) * PAGE_SIZE;
- }
-
-read_last_stripe:
- return 0;
-}
-
-static int _read_4_write_last_stripe(struct ore_io_state *ios)
-{
- struct ore_striping_info read_si;
- struct __stripe_pages_2d *sp2d = ios->sp2d;
- u64 offset;
- u64 last_stripe_end;
- unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
- unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
-
- offset = ios->offset + ios->length;
- if (offset % PAGE_SIZE)
- _add_to_r4w_last_page(ios, &offset);
- /* offset will be aligned to next page */
-
- last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
- * bytes_in_stripe;
- if (offset == last_stripe_end) /* Optimize for the aligned case */
- goto read_it;
-
- ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
- p = read_si.cur_pg;
- c = read_si.cur_comp;
-
- if (min_p == sp2d->pages_in_unit) {
- /* Didn't do it yet */
- min_p = _sp2d_min_pg(sp2d);
- max_p = _sp2d_max_pg(sp2d);
- }
-
- ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n",
- offset, last_stripe_end, min_p, max_p);
-
- while (offset < last_stripe_end) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if ((min_p <= p) && (p <= max_p)) {
- struct page *page;
- bool uptodate;
-
- BUG_ON(_1ps->pages[c]);
- page = ios->r4w->get_page(ios->private, offset,
- &uptodate);
- if (unlikely(!page))
- return -ENOMEM;
-
- _1ps->pages[c] = page;
- /* Mark read-pages to be cache_released */
- _1ps->page_is_read[c] = true;
- if (!uptodate)
- _add_to_r4w(ios, &read_si, page, PAGE_SIZE);
- }
-
- offset += PAGE_SIZE;
- if (p == (sp2d->pages_in_unit - 1)) {
- ++c;
- p = 0;
- ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
- } else {
- read_si.obj_offset += PAGE_SIZE;
- ++p;
- }
- }
-
-read_it:
- return 0;
-}
-
-static int _read_4_write_execute(struct ore_io_state *ios)
-{
- struct ore_io_state *ios_read;
- unsigned i;
- int ret;
-
- ios_read = ios->ios_read_4_write;
- if (!ios_read)
- return 0;
-
- /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change
- * to check for per_dev->bio
- */
- ios_read->pages = ios->pages;
-
- /* Now read these devices */
- for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) {
- ret = _ore_read_mirror(ios_read, i);
- if (unlikely(ret))
- return ret;
- }
-
- ret = ore_io_execute(ios_read); /* Synchronus execution */
- if (unlikely(ret)) {
- ORE_DBGMSG("!! ore_io_execute => %d\n", ret);
- return ret;
- }
-
- _mark_read4write_pages_uptodate(ios_read, ret);
- ore_put_io_state(ios_read);
- ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */
- return 0;
-}
-
-/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */
-int _ore_add_parity_unit(struct ore_io_state *ios,
- struct ore_striping_info *si,
- struct ore_per_dev_state *per_dev,
- unsigned cur_len, bool do_xor)
-{
- if (ios->reading) {
- if (per_dev->cur_sg >= ios->sgs_per_dev) {
- ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" ,
- per_dev->cur_sg, ios->sgs_per_dev);
- return -ENOMEM;
- }
- _ore_add_sg_seg(per_dev, cur_len, true);
- } else {
- struct __stripe_pages_2d *sp2d = ios->sp2d;
- struct page **pages = ios->parity_pages + ios->cur_par_page;
- unsigned num_pages;
- unsigned array_start = 0;
- unsigned i;
- int ret;
-
- si->cur_pg = _sp2d_min_pg(sp2d);
- num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
-
- if (!per_dev->length) {
- per_dev->offset += si->cur_pg * PAGE_SIZE;
- /* If first stripe, Read in all read4write pages
- * (if needed) before we calculate the first parity.
- */
- if (do_xor)
- _read_4_write_first_stripe(ios);
- }
- if (!cur_len && do_xor)
- /* If last stripe r4w pages of last stripe */
- _read_4_write_last_stripe(ios);
- _read_4_write_execute(ios);
-
- for (i = 0; i < num_pages; i++) {
- pages[i] = _raid_page_alloc();
- if (unlikely(!pages[i]))
- return -ENOMEM;
-
- ++(ios->cur_par_page);
- }
-
- BUG_ON(si->cur_comp < sp2d->data_devs);
- BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
-
- ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
- per_dev, num_pages * PAGE_SIZE);
- if (unlikely(ret))
- return ret;
-
- if (do_xor) {
- _gen_xor_unit(sp2d);
- _sp2d_reset(sp2d, ios->r4w, ios->private);
- }
- }
- return 0;
-}
-
-int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
-{
- if (ios->parity_pages) {
- struct ore_layout *layout = ios->layout;
- unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
-
- if (_sp2d_alloc(pages_in_unit, layout->group_width,
- layout->parity, &ios->sp2d)) {
- return -ENOMEM;
- }
- }
- return 0;
-}
-
-void _ore_free_raid_stuff(struct ore_io_state *ios)
-{
- if (ios->sp2d) { /* writing and raid */
- unsigned i;
-
- for (i = 0; i < ios->cur_par_page; i++) {
- struct page *page = ios->parity_pages[i];
-
- if (page)
- _raid_page_free(page);
- }
- if (ios->extra_part_alloc)
- kfree(ios->parity_pages);
- /* If IO returned an error pages might need unlocking */
- _sp2d_reset(ios->sp2d, ios->r4w, ios->private);
- _sp2d_free(ios->sp2d);
- } else {
- /* Will only be set if raid reading && sglist is big */
- if (ios->extra_part_alloc)
- kfree(ios->per_dev[0].sglist);
- }
- if (ios->ios_read_4_write)
- ore_put_io_state(ios->ios_read_4_write);
-}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
deleted file mode 100644
index a6e746775570..000000000000
--- a/fs/exofs/ore_raid.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (C) from 2011
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of the objects raid engine (ore).
- *
- * It is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * You should have received a copy of the GNU General Public License
- * along with "ore". If not, write to the Free Software Foundation, Inc:
- * "Free Software Foundation <info@fsf.org>"
- */
-
-#include <scsi/osd_ore.h>
-
-#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
-
-#ifdef CONFIG_EXOFS_DEBUG
-#define ORE_DBGMSG(fmt, a...) \
- printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define ORE_DBGMSG(fmt, a...) \
- do { if (0) printk(fmt, ##a); } while (0)
-#endif
-
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
-
-#define ORE_DBGMSG2(M...) do {} while (0)
-/* #define ORE_DBGMSG2 ORE_DBGMSG */
-
-/* ios_raid.c stuff needed by ios.c */
-int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
-void _ore_free_raid_stuff(struct ore_io_state *ios);
-
-void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
- bool not_last);
-int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
- struct ore_per_dev_state *per_dev, unsigned cur_len,
- bool do_xor);
-void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
- struct ore_striping_info *si, struct page *page);
-static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
- struct ore_striping_info *si, struct page *page)
-{
- if (!sp2d) /* Inline the fast path */
- return; /* Hay no raid stuff */
- _ore_add_stripe_page(sp2d, si, page);
-}
-
-/* ios.c stuff needed by ios_raid.c */
-int _ore_get_io_state(struct ore_layout *layout,
- struct ore_components *oc, unsigned numdevs,
- unsigned sgs_per_dev, unsigned num_par_pages,
- struct ore_io_state **pios);
-int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
- unsigned pgbase, struct page **pages,
- struct ore_per_dev_state *per_dev, int cur_len);
-int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
-int ore_io_execute(struct ore_io_state *ios);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
deleted file mode 100644
index fc80c7233fa5..000000000000
--- a/fs/exofs/super.c
+++ /dev/null
@@ -1,1071 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/string.h>
-#include <linux/parser.h>
-#include <linux/vfs.h>
-#include <linux/random.h>
-#include <linux/module.h>
-#include <linux/exportfs.h>
-#include <linux/slab.h>
-#include <linux/iversion.h>
-
-#include "exofs.h"
-
-#define EXOFS_DBGMSG2(M...) do {} while (0)
-
-/******************************************************************************
- * MOUNT OPTIONS
- *****************************************************************************/
-
-/*
- * struct to hold what we get from mount options
- */
-struct exofs_mountopt {
- bool is_osdname;
- const char *dev_name;
- uint64_t pid;
- int timeout;
-};
-
-/*
- * exofs-specific mount-time options.
- */
-enum { Opt_name, Opt_pid, Opt_to, Opt_err };
-
-/*
- * Our mount-time options. These should ideally be 64-bit unsigned, but the
- * kernel's parsing functions do not currently support that. 32-bit should be
- * sufficient for most applications now.
- */
-static match_table_t tokens = {
- {Opt_name, "osdname=%s"},
- {Opt_pid, "pid=%u"},
- {Opt_to, "to=%u"},
- {Opt_err, NULL}
-};
-
-/*
- * The main option parsing method. Also makes sure that all of the mandatory
- * mount options were set.
- */
-static int parse_options(char *options, struct exofs_mountopt *opts)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- bool s_pid = false;
-
- EXOFS_DBGMSG("parse_options %s\n", options);
- /* defaults */
- memset(opts, 0, sizeof(*opts));
- opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- char str[32];
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_name:
- kfree(opts->dev_name);
- opts->dev_name = match_strdup(&args[0]);
- if (unlikely(!opts->dev_name)) {
- EXOFS_ERR("Error allocating dev_name");
- return -ENOMEM;
- }
- opts->is_osdname = true;
- break;
- case Opt_pid:
- if (0 == match_strlcpy(str, &args[0], sizeof(str)))
- return -EINVAL;
- opts->pid = simple_strtoull(str, NULL, 0);
- if (opts->pid < EXOFS_MIN_PID) {
- EXOFS_ERR("Partition ID must be >= %u",
- EXOFS_MIN_PID);
- return -EINVAL;
- }
- s_pid = true;
- break;
- case Opt_to:
- if (match_int(&args[0], &option))
- return -EINVAL;
- if (option <= 0) {
- EXOFS_ERR("Timeout must be > 0");
- return -EINVAL;
- }
- opts->timeout = option * HZ;
- break;
- }
- }
-
- if (!s_pid) {
- EXOFS_ERR("Need to specify the following options:\n");
- EXOFS_ERR(" -o pid=pid_no_to_use\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-/******************************************************************************
- * INODE CACHE
- *****************************************************************************/
-
-/*
- * Our inode cache. Isn't it pretty?
- */
-static struct kmem_cache *exofs_inode_cachep;
-
-/*
- * Allocate an inode in the cache
- */
-static struct inode *exofs_alloc_inode(struct super_block *sb)
-{
- struct exofs_i_info *oi;
-
- oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
- if (!oi)
- return NULL;
-
- inode_set_iversion(&oi->vfs_inode, 1);
- return &oi->vfs_inode;
-}
-
-static void exofs_i_callback(struct rcu_head *head)
-{
- struct inode *inode = container_of(head, struct inode, i_rcu);
- kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
-}
-
-/*
- * Remove an inode from the cache
- */
-static void exofs_destroy_inode(struct inode *inode)
-{
- call_rcu(&inode->i_rcu, exofs_i_callback);
-}
-
-/*
- * Initialize the inode
- */
-static void exofs_init_once(void *foo)
-{
- struct exofs_i_info *oi = foo;
-
- inode_init_once(&oi->vfs_inode);
-}
-
-/*
- * Create and initialize the inode cache
- */
-static int init_inodecache(void)
-{
- exofs_inode_cachep = kmem_cache_create_usercopy("exofs_inode_cache",
- sizeof(struct exofs_i_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
- SLAB_ACCOUNT,
- offsetof(struct exofs_i_info, i_data),
- sizeof_field(struct exofs_i_info, i_data),
- exofs_init_once);
- if (exofs_inode_cachep == NULL)
- return -ENOMEM;
- return 0;
-}
-
-/*
- * Destroy the inode cache
- */
-static void destroy_inodecache(void)
-{
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(exofs_inode_cachep);
-}
-
-/******************************************************************************
- * Some osd helpers
- *****************************************************************************/
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
-{
- osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
-}
-
-static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
- u64 offset, void *p, unsigned length)
-{
- struct osd_request *or = osd_start_request(od);
-/* struct osd_sense_info osi = {.key = 0};*/
- int ret;
-
- if (unlikely(!or)) {
- EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
- return -ENOMEM;
- }
- ret = osd_req_read_kern(or, obj, offset, p, length);
- if (unlikely(ret)) {
- EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
- goto out;
- }
-
- ret = osd_finalize_request(or, 0, cred, NULL);
- if (unlikely(ret)) {
- EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
- goto out;
- }
-
- ret = osd_execute_request(or);
- if (unlikely(ret))
- EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
- /* osd_req_decode_sense(or, ret); */
-
-out:
- osd_end_request(or);
- EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%p ret=>%d\n",
- _LLU(obj->id), _LLU(offset), _LLU(length), od, ret);
- return ret;
-}
-
-static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
- EXOFS_APAGE_SB_DATA,
- EXOFS_ATTR_SB_STATS,
- sizeof(struct exofs_sb_stats));
-
-static int __sbi_read_stats(struct exofs_sb_info *sbi)
-{
- struct osd_attr attrs[] = {
- [0] = g_attr_sb_stats,
- };
- struct ore_io_state *ios;
- int ret;
-
- ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
- return ret;
- }
-
- ios->in_attr = attrs;
- ios->in_attr_len = ARRAY_SIZE(attrs);
-
- ret = ore_read(ios);
- if (unlikely(ret)) {
- EXOFS_ERR("Error reading super_block stats => %d\n", ret);
- goto out;
- }
-
- ret = extract_attr_from_ios(ios, &attrs[0]);
- if (ret) {
- EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
- goto out;
- }
- if (attrs[0].len) {
- struct exofs_sb_stats *ess;
-
- if (unlikely(attrs[0].len != sizeof(*ess))) {
- EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
- "size(%d) != expected(%zd)\n",
- __func__, attrs[0].len, sizeof(*ess));
- goto out;
- }
-
- ess = attrs[0].val_ptr;
- sbi->s_nextid = le64_to_cpu(ess->s_nextid);
- sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
- }
-
-out:
- ore_put_io_state(ios);
- return ret;
-}
-
-static void stats_done(struct ore_io_state *ios, void *p)
-{
- ore_put_io_state(ios);
- /* Good thanks nothing to do anymore */
-}
-
-/* Asynchronously write the stats attribute */
-int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
-{
- struct osd_attr attrs[] = {
- [0] = g_attr_sb_stats,
- };
- struct ore_io_state *ios;
- int ret;
-
- ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
- return ret;
- }
-
- sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid);
- sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
- attrs[0].val_ptr = &sbi->s_ess;
-
-
- ios->done = stats_done;
- ios->private = sbi;
- ios->out_attr = attrs;
- ios->out_attr_len = ARRAY_SIZE(attrs);
-
- ret = ore_write(ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: ore_write failed.\n", __func__);
- ore_put_io_state(ios);
- }
-
- return ret;
-}
-
-/******************************************************************************
- * SUPERBLOCK FUNCTIONS
- *****************************************************************************/
-static const struct super_operations exofs_sops;
-static const struct export_operations exofs_export_ops;
-
-/*
- * Write the superblock to the OSD
- */
-static int exofs_sync_fs(struct super_block *sb, int wait)
-{
- struct exofs_sb_info *sbi;
- struct exofs_fscb *fscb;
- struct ore_comp one_comp;
- struct ore_components oc;
- struct ore_io_state *ios;
- int ret = -ENOMEM;
-
- fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
- if (unlikely(!fscb))
- return -ENOMEM;
-
- sbi = sb->s_fs_info;
-
- /* NOTE: We no longer dirty the super_block anywhere in exofs. The
- * reason we write the fscb here on unmount is so we can stay backwards
- * compatible with fscb->s_version == 1. (What we are not compatible
- * with is if a new version FS crashed and then we try to mount an old
- * version). Otherwise the exofs_fscb is read-only from mkfs time. All
- * the writeable info is set in exofs_sbi_write_stats() above.
- */
-
- exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID);
-
- ret = ore_get_io_state(&sbi->layout, &oc, &ios);
- if (unlikely(ret))
- goto out;
-
- ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
- memset(fscb, 0, ios->length);
- fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
- fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles);
- fscb->s_magic = cpu_to_le16(sb->s_magic);
- fscb->s_newfs = 0;
- fscb->s_version = EXOFS_FSCB_VER;
-
- ios->offset = 0;
- ios->kern_buff = fscb;
-
- ret = ore_write(ios);
- if (unlikely(ret))
- EXOFS_ERR("%s: ore_write failed.\n", __func__);
-
-out:
- EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
- ore_put_io_state(ios);
- kfree(fscb);
- return ret;
-}
-
-static void _exofs_print_device(const char *msg, const char *dev_path,
- struct osd_dev *od, u64 pid)
-{
- const struct osd_dev_info *odi = osduld_device_info(od);
-
- printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
- msg, dev_path ?: "", odi->osdname, _LLU(pid));
-}
-
-static void exofs_free_sbi(struct exofs_sb_info *sbi)
-{
- unsigned numdevs = sbi->oc.numdevs;
-
- while (numdevs) {
- unsigned i = --numdevs;
- struct osd_dev *od = ore_comp_dev(&sbi->oc, i);
-
- if (od) {
- ore_comp_set_dev(&sbi->oc, i, NULL);
- osduld_put_device(od);
- }
- }
- kfree(sbi->oc.ods);
- kfree(sbi);
-}
-
-/*
- * This function is called when the vfs is freeing the superblock. We just
- * need to free our own part.
- */
-static void exofs_put_super(struct super_block *sb)
-{
- int num_pend;
- struct exofs_sb_info *sbi = sb->s_fs_info;
-
- /* make sure there are no pending commands */
- for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
- num_pend = atomic_read(&sbi->s_curr_pending)) {
- wait_queue_head_t wq;
-
- printk(KERN_NOTICE "%s: !!Pending operations in flight. "
- "This is a BUG. please report to osd-dev@open-osd.org\n",
- __func__);
- init_waitqueue_head(&wq);
- wait_event_timeout(wq,
- (atomic_read(&sbi->s_curr_pending) == 0),
- msecs_to_jiffies(100));
- }
-
- _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
- sbi->one_comp.obj.partition);
-
- exofs_sysfs_sb_del(sbi);
- exofs_free_sbi(sbi);
- sb->s_fs_info = NULL;
-}
-
-static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
- struct exofs_device_table *dt)
-{
- int ret;
-
- sbi->layout.stripe_unit =
- le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
- sbi->layout.group_width =
- le32_to_cpu(dt->dt_data_map.cb_group_width);
- sbi->layout.group_depth =
- le32_to_cpu(dt->dt_data_map.cb_group_depth);
- sbi->layout.mirrors_p1 =
- le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
- sbi->layout.raid_algorithm =
- le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
-
- ret = ore_verify_layout(numdevs, &sbi->layout);
-
- EXOFS_DBGMSG("exofs: layout: "
- "num_comps=%u stripe_unit=0x%x group_width=%u "
- "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n",
- numdevs,
- sbi->layout.stripe_unit,
- sbi->layout.group_width,
- _LLU(sbi->layout.group_depth),
- sbi->layout.mirrors_p1,
- sbi->layout.raid_algorithm);
- return ret;
-}
-
-static unsigned __ra_pages(struct ore_layout *layout)
-{
- const unsigned _MIN_RA = 32; /* min 128K read-ahead */
- unsigned ra_pages = layout->group_width * layout->stripe_unit /
- PAGE_SIZE;
- unsigned max_io_pages = exofs_max_io_pages(layout, ~0);
-
- ra_pages *= 2; /* two stripes */
- if (ra_pages < _MIN_RA)
- ra_pages = roundup(_MIN_RA, ra_pages / 2);
-
- if (ra_pages > max_io_pages)
- ra_pages = max_io_pages;
-
- return ra_pages;
-}
-
-/* @odi is valid only as long as @fscb_dev is valid */
-static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
- struct osd_dev_info *odi)
-{
- odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
- if (likely(odi->systemid_len))
- memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN);
-
- odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
- odi->osdname = dt_dev->osdname;
-
- /* FIXME support long names. Will need a _put function */
- if (dt_dev->long_name_offset)
- return -EINVAL;
-
- /* Make sure osdname is printable!
- * mkexofs should give us space for a null-terminator else the
- * device-table is invalid.
- */
- if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
- odi->osdname_len = sizeof(dt_dev->osdname) - 1;
- dt_dev->osdname[odi->osdname_len] = 0;
-
- /* If it's all zeros something is bad we read past end-of-obj */
- return !(odi->systemid_len || odi->osdname_len);
-}
-
-static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
- struct exofs_dev **peds)
-{
- /* Twice bigger table: See exofs_init_comps() and comment at
- * exofs_read_lookup_dev_table()
- */
- const size_t numores = numdevs * 2 - 1;
- struct exofs_dev *eds;
- unsigned i;
-
- sbi->oc.ods = kzalloc(numores * sizeof(struct ore_dev *) +
- numdevs * sizeof(struct exofs_dev), GFP_KERNEL);
- if (unlikely(!sbi->oc.ods)) {
- EXOFS_ERR("ERROR: failed allocating Device array[%d]\n",
- numdevs);
- return -ENOMEM;
- }
-
- /* Start of allocated struct exofs_dev entries */
- *peds = eds = (void *)sbi->oc.ods[numores];
- /* Initialize pointers into struct exofs_dev */
- for (i = 0; i < numdevs; ++i)
- sbi->oc.ods[i] = &eds[i].ored;
- return 0;
-}
-
-static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
- struct osd_dev *fscb_od,
- unsigned table_count)
-{
- struct ore_comp comp;
- struct exofs_device_table *dt;
- struct exofs_dev *eds;
- unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
- sizeof(*dt);
- unsigned numdevs, i;
- int ret;
-
- dt = kmalloc(table_bytes, GFP_KERNEL);
- if (unlikely(!dt)) {
- EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
- table_bytes);
- return -ENOMEM;
- }
-
- sbi->oc.numdevs = 0;
-
- comp.obj.partition = sbi->one_comp.obj.partition;
- comp.obj.id = EXOFS_DEVTABLE_ID;
- exofs_make_credential(comp.cred, &comp.obj);
-
- ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt,
- table_bytes);
- if (unlikely(ret)) {
- EXOFS_ERR("ERROR: reading device table\n");
- goto out;
- }
-
- numdevs = le64_to_cpu(dt->dt_num_devices);
- if (unlikely(!numdevs)) {
- ret = -EINVAL;
- goto out;
- }
- WARN_ON(table_count != numdevs);
-
- ret = _read_and_match_data_map(sbi, numdevs, dt);
- if (unlikely(ret))
- goto out;
-
- ret = __alloc_dev_table(sbi, numdevs, &eds);
- if (unlikely(ret))
- goto out;
- /* exofs round-robins the device table view according to inode
- * number. We hold a: twice bigger table hence inodes can point
- * to any device and have a sequential view of the table
- * starting at this device. See exofs_init_comps()
- */
- memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
- (numdevs - 1) * sizeof(sbi->oc.ods[0]));
-
- /* create sysfs subdir under which we put the device table
- * And cluster layout. A Superblock is identified by the string:
- * "dev[0].osdname"_"pid"
- */
- exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]);
-
- for (i = 0; i < numdevs; i++) {
- struct exofs_fscb fscb;
- struct osd_dev_info odi;
- struct osd_dev *od;
-
- if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
- EXOFS_ERR("ERROR: Read all-zeros device entry\n");
- ret = -EINVAL;
- goto out;
- }
-
- printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
- i, odi.osdname);
-
- /* the exofs id is currently the table index */
- eds[i].did = i;
-
- /* On all devices the device table is identical. The user can
- * specify any one of the participating devices on the command
- * line. We always keep them in device-table order.
- */
- if (fscb_od && osduld_device_same(fscb_od, &odi)) {
- eds[i].ored.od = fscb_od;
- ++sbi->oc.numdevs;
- fscb_od = NULL;
- exofs_sysfs_odev_add(&eds[i], sbi);
- continue;
- }
-
- od = osduld_info_lookup(&odi);
- if (IS_ERR(od)) {
- ret = PTR_ERR(od);
- EXOFS_ERR("ERROR: device requested is not found "
- "osd_name-%s =>%d\n", odi.osdname, ret);
- goto out;
- }
-
- eds[i].ored.od = od;
- ++sbi->oc.numdevs;
-
- /* Read the fscb of the other devices to make sure the FS
- * partition is there.
- */
- ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb,
- sizeof(fscb));
- if (unlikely(ret)) {
- EXOFS_ERR("ERROR: Malformed participating device "
- "error reading fscb osd_name-%s\n",
- odi.osdname);
- goto out;
- }
- exofs_sysfs_odev_add(&eds[i], sbi);
-
- /* TODO: verify other information is correct and FS-uuid
- * matches. Benny what did you say about device table
- * generation and old devices?
- */
- }
-
-out:
- kfree(dt);
- if (unlikely(fscb_od && !ret)) {
- EXOFS_ERR("ERROR: Bad device-table container device not present\n");
- osduld_put_device(fscb_od);
- return -EINVAL;
- }
- return ret;
-}
-
-/*
- * Read the superblock from the OSD and fill in the fields
- */
-static int exofs_fill_super(struct super_block *sb,
- struct exofs_mountopt *opts,
- struct exofs_sb_info *sbi,
- int silent)
-{
- struct inode *root;
- struct osd_dev *od; /* Master device */
- struct exofs_fscb fscb; /*on-disk superblock info */
- struct ore_comp comp;
- unsigned table_count;
- int ret;
-
- /* use mount options to fill superblock */
- if (opts->is_osdname) {
- struct osd_dev_info odi = {.systemid_len = 0};
-
- odi.osdname_len = strlen(opts->dev_name);
- odi.osdname = (u8 *)opts->dev_name;
- od = osduld_info_lookup(&odi);
- kfree(opts->dev_name);
- opts->dev_name = NULL;
- } else {
- od = osduld_path_lookup(opts->dev_name);
- }
- if (IS_ERR(od)) {
- ret = -EINVAL;
- goto free_sbi;
- }
-
- /* Default layout in case we do not have a device-table */
- sbi->layout.stripe_unit = PAGE_SIZE;
- sbi->layout.mirrors_p1 = 1;
- sbi->layout.group_width = 1;
- sbi->layout.group_depth = -1;
- sbi->layout.group_count = 1;
- sbi->s_timeout = opts->timeout;
-
- sbi->one_comp.obj.partition = opts->pid;
- sbi->one_comp.obj.id = 0;
- exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
- sbi->oc.single_comp = EC_SINGLE_COMP;
- sbi->oc.comps = &sbi->one_comp;
-
- /* fill in some other data by hand */
- memset(sb->s_id, 0, sizeof(sb->s_id));
- strcpy(sb->s_id, "exofs");
- sb->s_blocksize = EXOFS_BLKSIZE;
- sb->s_blocksize_bits = EXOFS_BLKSHIFT;
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_max_links = EXOFS_LINK_MAX;
- atomic_set(&sbi->s_curr_pending, 0);
- sb->s_bdev = NULL;
- sb->s_dev = 0;
-
- comp.obj.partition = sbi->one_comp.obj.partition;
- comp.obj.id = EXOFS_SUPER_ID;
- exofs_make_credential(comp.cred, &comp.obj);
-
- ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb));
- if (unlikely(ret))
- goto free_sbi;
-
- sb->s_magic = le16_to_cpu(fscb.s_magic);
- /* NOTE: we read below to be backward compatible with old versions */
- sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
- sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
-
- /* make sure what we read from the object store is correct */
- if (sb->s_magic != EXOFS_SUPER_MAGIC) {
- if (!silent)
- EXOFS_ERR("ERROR: Bad magic value\n");
- ret = -EINVAL;
- goto free_sbi;
- }
- if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
- EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
- EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
- ret = -EINVAL;
- goto free_sbi;
- }
-
- /* start generation numbers from a random point */
- get_random_bytes(&sbi->s_next_generation, sizeof(u32));
- spin_lock_init(&sbi->s_next_gen_lock);
-
- table_count = le64_to_cpu(fscb.s_dev_table_count);
- if (table_count) {
- ret = exofs_read_lookup_dev_table(sbi, od, table_count);
- if (unlikely(ret))
- goto free_sbi;
- } else {
- struct exofs_dev *eds;
-
- ret = __alloc_dev_table(sbi, 1, &eds);
- if (unlikely(ret))
- goto free_sbi;
-
- ore_comp_set_dev(&sbi->oc, 0, od);
- sbi->oc.numdevs = 1;
- }
-
- __sbi_read_stats(sbi);
-
- /* set up operation vectors */
- ret = super_setup_bdi(sb);
- if (ret) {
- EXOFS_DBGMSG("Failed to super_setup_bdi\n");
- goto free_sbi;
- }
- sb->s_bdi->ra_pages = __ra_pages(&sbi->layout);
- sb->s_fs_info = sbi;
- sb->s_op = &exofs_sops;
- sb->s_export_op = &exofs_export_ops;
- root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
- if (IS_ERR(root)) {
- EXOFS_ERR("ERROR: exofs_iget failed\n");
- ret = PTR_ERR(root);
- goto free_sbi;
- }
- sb->s_root = d_make_root(root);
- if (!sb->s_root) {
- EXOFS_ERR("ERROR: get root inode failed\n");
- ret = -ENOMEM;
- goto free_sbi;
- }
-
- if (!S_ISDIR(root->i_mode)) {
- dput(sb->s_root);
- sb->s_root = NULL;
- EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
- root->i_mode);
- ret = -EINVAL;
- goto free_sbi;
- }
-
- exofs_sysfs_dbg_print();
- _exofs_print_device("Mounting", opts->dev_name,
- ore_comp_dev(&sbi->oc, 0),
- sbi->one_comp.obj.partition);
- return 0;
-
-free_sbi:
- EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
- opts->dev_name, sbi->one_comp.obj.partition, ret);
- exofs_free_sbi(sbi);
- return ret;
-}
-
-/*
- * Set up the superblock (calls exofs_fill_super eventually)
- */
-static struct dentry *exofs_mount(struct file_system_type *type,
- int flags, const char *dev_name,
- void *data)
-{
- struct super_block *s;
- struct exofs_mountopt opts;
- struct exofs_sb_info *sbi;
- int ret;
-
- ret = parse_options(data, &opts);
- if (ret) {
- kfree(opts.dev_name);
- return ERR_PTR(ret);
- }
-
- sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- if (!sbi) {
- kfree(opts.dev_name);
- return ERR_PTR(-ENOMEM);
- }
-
- s = sget(type, NULL, set_anon_super, flags, NULL);
-
- if (IS_ERR(s)) {
- kfree(opts.dev_name);
- kfree(sbi);
- return ERR_CAST(s);
- }
-
- if (!opts.dev_name)
- opts.dev_name = dev_name;
-
-
- ret = exofs_fill_super(s, &opts, sbi, flags & SB_SILENT ? 1 : 0);
- if (ret) {
- deactivate_locked_super(s);
- return ERR_PTR(ret);
- }
- s->s_flags |= SB_ACTIVE;
- return dget(s->s_root);
-}
-
-/*
- * Return information about the file system state in the buffer. This is used
- * by the 'df' command, for example.
- */
-static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
- struct super_block *sb = dentry->d_sb;
- struct exofs_sb_info *sbi = sb->s_fs_info;
- struct ore_io_state *ios;
- struct osd_attr attrs[] = {
- ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
- OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
- ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
- OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
- };
- uint64_t capacity = ULLONG_MAX;
- uint64_t used = ULLONG_MAX;
- int ret;
-
- ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
- if (ret) {
- EXOFS_DBGMSG("ore_get_io_state failed.\n");
- return ret;
- }
-
- ios->in_attr = attrs;
- ios->in_attr_len = ARRAY_SIZE(attrs);
-
- ret = ore_read(ios);
- if (unlikely(ret))
- goto out;
-
- ret = extract_attr_from_ios(ios, &attrs[0]);
- if (likely(!ret)) {
- capacity = get_unaligned_be64(attrs[0].val_ptr);
- if (unlikely(!capacity))
- capacity = ULLONG_MAX;
- } else
- EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
-
- ret = extract_attr_from_ios(ios, &attrs[1]);
- if (likely(!ret))
- used = get_unaligned_be64(attrs[1].val_ptr);
- else
- EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
-
- /* fill in the stats buffer */
- buf->f_type = EXOFS_SUPER_MAGIC;
- buf->f_bsize = EXOFS_BLKSIZE;
- buf->f_blocks = capacity >> 9;
- buf->f_bfree = (capacity - used) >> 9;
- buf->f_bavail = buf->f_bfree;
- buf->f_files = sbi->s_numfiles;
- buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
- buf->f_namelen = EXOFS_NAME_LEN;
-
-out:
- ore_put_io_state(ios);
- return ret;
-}
-
-static const struct super_operations exofs_sops = {
- .alloc_inode = exofs_alloc_inode,
- .destroy_inode = exofs_destroy_inode,
- .write_inode = exofs_write_inode,
- .evict_inode = exofs_evict_inode,
- .put_super = exofs_put_super,
- .sync_fs = exofs_sync_fs,
- .statfs = exofs_statfs,
-};
-
-/******************************************************************************
- * EXPORT OPERATIONS
- *****************************************************************************/
-
-static struct dentry *exofs_get_parent(struct dentry *child)
-{
- unsigned long ino = exofs_parent_ino(child);
-
- if (!ino)
- return ERR_PTR(-ESTALE);
-
- return d_obtain_alias(exofs_iget(child->d_sb, ino));
-}
-
-static struct inode *exofs_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
-{
- struct inode *inode;
-
- inode = exofs_iget(sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
- if (generation && inode->i_generation != generation) {
- /* we didn't find the right inode.. */
- iput(inode);
- return ERR_PTR(-ESTALE);
- }
- return inode;
-}
-
-static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
- struct fid *fid, int fh_len, int fh_type)
-{
- return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
- exofs_nfs_get_inode);
-}
-
-static struct dentry *exofs_fh_to_parent(struct super_block *sb,
- struct fid *fid, int fh_len, int fh_type)
-{
- return generic_fh_to_parent(sb, fid, fh_len, fh_type,
- exofs_nfs_get_inode);
-}
-
-static const struct export_operations exofs_export_ops = {
- .fh_to_dentry = exofs_fh_to_dentry,
- .fh_to_parent = exofs_fh_to_parent,
- .get_parent = exofs_get_parent,
-};
-
-/******************************************************************************
- * INSMOD/RMMOD
- *****************************************************************************/
-
-/*
- * struct that describes this file system
- */
-static struct file_system_type exofs_type = {
- .owner = THIS_MODULE,
- .name = "exofs",
- .mount = exofs_mount,
- .kill_sb = generic_shutdown_super,
-};
-MODULE_ALIAS_FS("exofs");
-
-static int __init init_exofs(void)
-{
- int err;
-
- err = init_inodecache();
- if (err)
- goto out;
-
- err = register_filesystem(&exofs_type);
- if (err)
- goto out_d;
-
- /* We don't fail if sysfs creation failed */
- exofs_sysfs_init();
-
- return 0;
-out_d:
- destroy_inodecache();
-out:
- return err;
-}
-
-static void __exit exit_exofs(void)
-{
- exofs_sysfs_uninit();
- unregister_filesystem(&exofs_type);
- destroy_inodecache();
-}
-
-MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
-MODULE_DESCRIPTION("exofs");
-MODULE_LICENSE("GPL");
-
-module_init(init_exofs)
-module_exit(exit_exofs)
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
deleted file mode 100644
index 1f7d5e46cdda..000000000000
--- a/fs/exofs/sys.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Copyright (C) 2012
- * Sachin Bhamare <sbhamare@panasas.com>
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License 2 as published by
- * the Free Software Foundation.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the:
- * Free Software Foundation <licensing@fsf.org>
- */
-
-#include <linux/kobject.h>
-#include <linux/device.h>
-
-#include "exofs.h"
-
-struct odev_attr {
- struct attribute attr;
- ssize_t (*show)(struct exofs_dev *, char *);
- ssize_t (*store)(struct exofs_dev *, const char *, size_t);
-};
-
-static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr,
- char *buf)
-{
- struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
- struct odev_attr *a = container_of(attr, struct odev_attr, attr);
-
- return a->show ? a->show(edp, buf) : 0;
-}
-
-static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t len)
-{
- struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
- struct odev_attr *a = container_of(attr, struct odev_attr, attr);
-
- return a->store ? a->store(edp, buf, len) : len;
-}
-
-static const struct sysfs_ops odev_attr_ops = {
- .show = odev_attr_show,
- .store = odev_attr_store,
-};
-
-
-static struct kset *exofs_kset;
-
-static ssize_t osdname_show(struct exofs_dev *edp, char *buf)
-{
- struct osd_dev *odev = edp->ored.od;
- const struct osd_dev_info *odi = osduld_device_info(odev);
-
- return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname);
-}
-
-static ssize_t systemid_show(struct exofs_dev *edp, char *buf)
-{
- struct osd_dev *odev = edp->ored.od;
- const struct osd_dev_info *odi = osduld_device_info(odev);
-
- memcpy(buf, odi->systemid, odi->systemid_len);
- return odi->systemid_len;
-}
-
-static ssize_t uri_show(struct exofs_dev *edp, char *buf)
-{
- return snprintf(buf, edp->urilen, "%s", edp->uri);
-}
-
-static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len)
-{
- uint8_t *new_uri;
-
- edp->urilen = strlen(buf) + 1;
- new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL);
- if (new_uri == NULL)
- return -ENOMEM;
- edp->uri = new_uri;
- strncpy(edp->uri, buf, edp->urilen);
- return edp->urilen;
-}
-
-#define OSD_ATTR(name, mode, show, store) \
- static struct odev_attr odev_attr_##name = \
- __ATTR(name, mode, show, store)
-
-OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL);
-OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL);
-OSD_ATTR(uri, S_IRWXU, uri_show, uri_store);
-
-static struct attribute *odev_attrs[] = {
- &odev_attr_osdname.attr,
- &odev_attr_systemid.attr,
- &odev_attr_uri.attr,
- NULL,
-};
-
-static struct kobj_type odev_ktype = {
- .default_attrs = odev_attrs,
- .sysfs_ops = &odev_attr_ops,
-};
-
-static struct kobj_type uuid_ktype = {
-};
-
-void exofs_sysfs_dbg_print(void)
-{
-#ifdef CONFIG_EXOFS_DEBUG
- struct kobject *k_name, *k_tmp;
-
- list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
- printk(KERN_INFO "%s: name %s ref %d\n",
- __func__, kobject_name(k_name),
- (int)kref_read(&k_name->kref));
- }
-#endif
-}
-/*
- * This function removes all kobjects under exofs_kset
- * At the end of it, exofs_kset kobject will have a refcount
- * of 1 which gets decremented only on exofs module unload
- */
-void exofs_sysfs_sb_del(struct exofs_sb_info *sbi)
-{
- struct kobject *k_name, *k_tmp;
- struct kobject *s_kobj = &sbi->s_kobj;
-
- list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
- /* Remove all that are children of this SBI */
- if (k_name->parent == s_kobj)
- kobject_put(k_name);
- }
- kobject_put(s_kobj);
-}
-
-/*
- * This function creates sysfs entries to hold the current exofs cluster
- * instance (uniquely identified by osdname,pid tuple).
- * This function gets called once per exofs mount instance.
- */
-int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
- struct exofs_dt_device_info *dt_dev)
-{
- struct kobject *s_kobj;
- int retval = 0;
- uint64_t pid = sbi->one_comp.obj.partition;
-
- /* allocate new uuid dirent */
- s_kobj = &sbi->s_kobj;
- s_kobj->kset = exofs_kset;
- retval = kobject_init_and_add(s_kobj, &uuid_ktype,
- &exofs_kset->kobj, "%s_%llx", dt_dev->osdname, pid);
- if (retval) {
- EXOFS_ERR("ERROR: Failed to create sysfs entry for "
- "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval);
- return -ENOMEM;
- }
- return 0;
-}
-
-int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi)
-{
- struct kobject *d_kobj;
- int retval = 0;
-
- /* create osd device group which contains following attributes
- * osdname, systemid & uri
- */
- d_kobj = &edev->ed_kobj;
- d_kobj->kset = exofs_kset;
- retval = kobject_init_and_add(d_kobj, &odev_ktype,
- &sbi->s_kobj, "dev%u", edev->did);
- if (retval) {
- EXOFS_ERR("ERROR: Failed to create sysfs entry for "
- "device dev%u\n", edev->did);
- return retval;
- }
- return 0;
-}
-
-int exofs_sysfs_init(void)
-{
- exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj);
- if (!exofs_kset) {
- EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n");
- return -ENOMEM;
- }
- return 0;
-}
-
-void exofs_sysfs_uninit(void)
-{
- kset_unregister(exofs_kset);
-}
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index a453cc87082b..06f77ca7f36e 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -96,23 +96,8 @@ config EXT4_FS_SECURITY
If you are not using a security module that requires using
extended attributes for file security labels, say N.
-config EXT4_ENCRYPTION
- bool "Ext4 Encryption"
- depends on EXT4_FS
- select FS_ENCRYPTION
- help
- Enable encryption of ext4 files and directories. This
- feature is similar to ecryptfs, but it is more memory
- efficient since it avoids caching the encrypted and
- decrypted pages in the page cache.
-
-config EXT4_FS_ENCRYPTION
- bool
- default y
- depends on EXT4_ENCRYPTION
-
config EXT4_DEBUG
- bool "EXT4 debugging support"
+ bool "Ext4 debugging support"
depends on EXT4_FS
help
Enables run-time debugging support for the ext4 filesystem.
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f93f9881ec18..0ccd51f72048 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -111,7 +111,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
int dir_has_error = 0;
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
- if (ext4_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = fscrypt_get_encryption_info(inode);
if (err && err != -ENOKEY)
return err;
@@ -138,7 +138,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
return err;
}
- if (ext4_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr);
if (err < 0)
return err;
@@ -245,7 +245,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
offset += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
if (le32_to_cpu(de->inode)) {
- if (!ext4_encrypted_inode(inode)) {
+ if (!IS_ENCRYPTED(inode)) {
if (!dir_emit(ctx, de->name,
de->name_len,
le32_to_cpu(de->inode),
@@ -283,9 +283,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
done:
err = 0;
errout:
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
fscrypt_fname_free_buffer(&fstr);
-#endif
brelse(bh);
return err;
}
@@ -613,7 +611,7 @@ finished:
static int ext4_dir_open(struct inode * inode, struct file * filp)
{
- if (ext4_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
return 0;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 185a05d3257e..0833b5fc0668 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -40,7 +40,6 @@
#include <linux/compat.h>
#endif
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION)
#include <linux/fscrypt.h>
#include <linux/compiler.h>
@@ -426,6 +425,9 @@ struct flex_groups {
/* Flags that are appropriate for non-directories/regular files. */
#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+/* The only flags that should be swapped */
+#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)
+
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
@@ -1326,7 +1328,7 @@ struct ext4_super_block {
#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \
EXT4_MF_TEST_DUMMY_ENCRYPTION))
#else
@@ -1662,6 +1664,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000
+extern void ext4_update_dynamic_rev(struct super_block *sb);
+
#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
@@ -1670,6 +1674,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
+ ext4_update_dynamic_rev(sb); \
EXT4_SB(sb)->s_es->s_feature_compat |= \
cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
} \
@@ -1687,6 +1692,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
+ ext4_update_dynamic_rev(sb); \
EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
} \
@@ -1704,6 +1710,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
+ ext4_update_dynamic_rev(sb); \
EXT4_SB(sb)->s_es->s_feature_incompat |= \
cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
} \
@@ -2017,7 +2024,6 @@ static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx));
desc.shash.tfm = sbi->s_chksum_driver;
- desc.shash.flags = 0;
*(u32 *)desc.ctx = crc;
BUG_ON(crypto_shash_update(&desc.shash, address, length));
@@ -2051,7 +2057,7 @@ struct ext4_filename {
const struct qstr *usr_fname;
struct fscrypt_str disk_name;
struct dx_hash_info hinfo;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_str crypto_buf;
#endif
};
@@ -2279,12 +2285,7 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
-static inline bool ext4_encrypted_inode(struct inode *inode)
-{
- return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
-}
-
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static inline int ext4_fname_setup_filename(struct inode *dir,
const struct qstr *iname,
int lookup, struct ext4_filename *fname)
@@ -2672,7 +2673,6 @@ do { \
#endif
-extern void ext4_update_dynamic_rev(struct super_block *sb);
extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
__u32 compat);
extern int ext4_update_rocompat_feature(handle_t *handle,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 15b6dd733780..75a5309f2231 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -384,7 +384,7 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle,
{
struct ext4_inode_info *ei = EXT4_I(inode);
- if (ext4_handle_valid(handle)) {
+ if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) {
ei->i_sync_tid = handle->h_transaction->t_tid;
if (datasync)
ei->i_datasync_tid = handle->h_transaction->t_tid;
@@ -411,7 +411,7 @@ static inline int ext4_inode_journal_mode(struct inode *inode)
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
!test_opt(inode->i_sb, DELALLOC))) {
/* We do not support data journalling for encrypted data */
- if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode))
+ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 240b6dea5441..0f89f5190cd7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2956,14 +2956,17 @@ again:
if (err < 0)
goto out;
- } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
+ } else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
+ partial.state == initial) {
/*
- * If there's an extent to the right its first cluster
- * contains the immediate right boundary of the
- * truncated/punched region. Set partial_cluster to
- * its negative value so it won't be freed if shared
- * with the current extent. The end < ee_block case
- * is handled in ext4_ext_rm_leaf().
+ * If we're punching, there's an extent to the right.
+ * If the partial cluster hasn't been set, set it to
+ * that extent's first cluster and its state to nofree
+ * so it won't be freed should it contain blocks to be
+ * removed. If it's already set (tofree/nofree), we're
+ * retrying and keep the original partial cluster info
+ * so a cluster marked tofree as a result of earlier
+ * extent removal is not lost.
*/
lblk = ex_end + 1;
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
@@ -3631,7 +3634,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
max_zeroout = sbi->s_extent_max_zeroout_kb >>
(inode->i_sb->s_blocksize_bits - 10);
- if (ext4_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
max_zeroout = 0;
/*
@@ -4048,18 +4051,8 @@ out:
} else
allocated = ret;
map->m_flags |= EXT4_MAP_NEW;
- /*
- * if we allocated more blocks than requested
- * we need to make sure we unmap the extra block
- * allocated. The actual needed block will get
- * unmapped later when we find the buffer_head marked
- * new.
- */
- if (allocated > map->m_len) {
- clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len,
- allocated - map->m_len);
+ if (allocated > map->m_len)
allocated = map->m_len;
- }
map->m_len = allocated;
map_out:
@@ -4818,7 +4811,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
* leave it disabled for encrypted inodes for now. This is a
* bug we should fix....
*/
- if (ext4_encrypted_inode(inode) &&
+ if (IS_ENCRYPTED(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
FALLOC_FL_ZERO_RANGE)))
return -EOPNOTSUPP;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 69d65d49837b..98ec11f69cd4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -125,7 +125,7 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
struct super_block *sb = inode->i_sb;
int blockmask = sb->s_blocksize - 1;
- if (pos >= i_size_read(inode))
+ if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
return 0;
if ((pos | iov_iter_alignment(from)) & blockmask)
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index e22dcfab308b..46b24da33a28 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -231,6 +231,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
break;
case DX_HASH_HALF_MD4_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
+ /* fall through */
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
@@ -244,6 +245,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
break;
case DX_HASH_TEA_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
+ /* fall through */
case DX_HASH_TEA:
p = name;
while (len > 0) {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 7ff14a1adba3..f3e17a8c84b4 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -771,7 +771,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
if (unlikely(ext4_forced_shutdown(sbi)))
return ERR_PTR(-EIO);
- if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
+ if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) &&
!(i_flags & EXT4_EA_INODE_FL)) {
err = fscrypt_get_encryption_info(dir);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index bf7fa1507e81..2024d3fa5504 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1183,18 +1183,21 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_IND_BLOCK:
nr = i_data[EXT4_DIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_DIND_BLOCK:
nr = i_data[EXT4_TIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_TIND_BLOCK:
;
}
@@ -1219,6 +1222,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
ext4_lblk_t offsets[4], offsets2[4];
Indirect chain[4], chain2[4];
Indirect *partial, *partial2;
+ Indirect *p = NULL, *p2 = NULL;
ext4_lblk_t max_block;
__le32 nr = 0, nr2 = 0;
int n = 0, n2 = 0;
@@ -1260,7 +1264,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
}
- partial = ext4_find_shared(inode, n, offsets, chain, &nr);
+ partial = p = ext4_find_shared(inode, n, offsets, chain, &nr);
if (nr) {
if (partial == chain) {
/* Shared branch grows from the inode */
@@ -1285,13 +1289,11 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
partial->p + 1,
(__le32 *)partial->bh->b_data+addr_per_block,
(chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
partial--;
}
end_range:
- partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
+ partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
if (nr2) {
if (partial2 == chain2) {
/*
@@ -1321,16 +1323,14 @@ end_range:
(__le32 *)partial2->bh->b_data,
partial2->p,
(chain2+n2-1) - partial2);
- BUFFER_TRACE(partial2->bh, "call brelse");
- brelse(partial2->bh);
partial2--;
}
goto do_indirects;
}
/* Punch happened within the same level (n == n2) */
- partial = ext4_find_shared(inode, n, offsets, chain, &nr);
- partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
+ partial = p = ext4_find_shared(inode, n, offsets, chain, &nr);
+ partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
/* Free top, but only if partial2 isn't its subtree. */
if (nr) {
@@ -1387,11 +1387,7 @@ end_range:
partial->p + 1,
partial2->p,
(chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
- BUFFER_TRACE(partial2->bh, "call brelse");
- brelse(partial2->bh);
- return 0;
+ goto cleanup;
}
/*
@@ -1406,8 +1402,6 @@ end_range:
partial->p + 1,
(__le32 *)partial->bh->b_data+addr_per_block,
(chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
partial--;
}
if (partial2 > chain2 && depth2 <= depth) {
@@ -1415,11 +1409,21 @@ end_range:
(__le32 *)partial2->bh->b_data,
partial2->p,
(chain2+n2-1) - partial2);
- BUFFER_TRACE(partial2->bh, "call brelse");
- brelse(partial2->bh);
partial2--;
}
}
+
+cleanup:
+ while (p && p > chain) {
+ BUFFER_TRACE(p->bh, "call brelse");
+ brelse(p->bh);
+ p--;
+ }
+ while (p2 && p2 > chain2) {
+ BUFFER_TRACE(p2->bh, "call brelse");
+ brelse(p2->bh);
+ p2--;
+ }
return 0;
do_indirects:
@@ -1427,30 +1431,33 @@ do_indirects:
switch (offsets[0]) {
default:
if (++n >= n2)
- return 0;
+ break;
nr = i_data[EXT4_IND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_IND_BLOCK:
if (++n >= n2)
- return 0;
+ break;
nr = i_data[EXT4_DIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_DIND_BLOCK:
if (++n >= n2)
- return 0;
+ break;
nr = i_data[EXT4_TIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_TIND_BLOCK:
;
}
- return 0;
+ goto cleanup;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 34d7e0703cc6..b32a57bc5d5d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -391,7 +391,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
* inode's preallocations.
*/
if ((ei->i_reserved_data_blocks == 0) &&
- (atomic_read(&inode->i_writecount) == 0))
+ !inode_is_open_for_write(inode))
ext4_discard_preallocations(inode);
}
@@ -415,7 +415,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
{
int ret;
- if (ext4_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
return fscrypt_zeroout_range(inode, lblk, pblk, len);
ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
@@ -678,8 +678,6 @@ found:
if (flags & EXT4_GET_BLOCKS_ZERO &&
map->m_flags & EXT4_MAP_MAPPED &&
map->m_flags & EXT4_MAP_NEW) {
- clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
- map->m_len);
ret = ext4_issue_zeroout(inode, map->m_lblk,
map->m_pblk, map->m_len);
if (ret) {
@@ -1150,7 +1148,7 @@ int do_journal_get_write_access(handle_t *handle,
return ret;
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
@@ -1194,7 +1192,6 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (err)
break;
if (buffer_new(bh)) {
- clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
@@ -1217,8 +1214,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
(block_start < from || block_end > to)) {
ll_rw_block(REQ_OP_READ, 0, 1, &bh);
*wait_bh++ = bh;
- decrypt = ext4_encrypted_inode(inode) &&
- S_ISREG(inode->i_mode);
+ decrypt = IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode);
}
}
/*
@@ -1303,7 +1299,7 @@ retry_journal:
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (ext4_should_dioread_nolock(inode))
ret = ext4_block_write_begin(page, pos, len,
ext4_get_block_unwritten);
@@ -2490,10 +2486,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
}
BUG_ON(map->m_len == 0);
- if (map->m_flags & EXT4_MAP_NEW) {
- clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
- map->m_len);
- }
return 0;
}
@@ -2836,12 +2828,12 @@ retry:
goto unplug;
}
ret = mpage_prepare_extent_to_map(&mpd);
+ /* Unlock pages we didn't use */
+ mpage_release_unused_pages(&mpd, false);
/* Submit prepared bio */
ext4_io_submit(&mpd.io_submit);
ext4_put_io_end_defer(mpd.io_submit.io_end);
mpd.io_submit.io_end = NULL;
- /* Unlock pages we didn't use */
- mpage_release_unused_pages(&mpd, false);
if (ret < 0)
goto unplug;
@@ -2909,10 +2901,11 @@ retry:
handle = NULL;
mpd.do_map = 0;
}
- /* Submit prepared bio */
- ext4_io_submit(&mpd.io_submit);
/* Unlock pages we didn't use */
mpage_release_unused_pages(&mpd, give_up_on_write);
+ /* Submit prepared bio */
+ ext4_io_submit(&mpd.io_submit);
+
/*
* Drop our io_end reference we got from init. We have
* to be careful and use deferred io_end finishing if
@@ -3105,7 +3098,7 @@ retry_journal:
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
ret = ext4_block_write_begin(page, pos, len,
ext4_da_get_block_prep);
#else
@@ -3880,8 +3873,8 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
loff_t offset = iocb->ki_pos;
ssize_t ret;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+#ifdef CONFIG_FS_ENCRYPTION
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
return 0;
#endif
@@ -4065,8 +4058,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
/* Uhhuh. Read error. Complain and punt. */
if (!buffer_uptodate(bh))
goto unlock;
- if (S_ISREG(inode->i_mode) &&
- ext4_encrypted_inode(inode)) {
+ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) {
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
BUG_ON(blocksize != PAGE_SIZE);
@@ -4142,7 +4134,7 @@ static int ext4_block_truncate_page(handle_t *handle,
struct inode *inode = mapping->host;
/* If we are processing an encrypted inode during orphan list handling */
- if (ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode))
+ if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
return 0;
blocksize = inode->i_sb->s_blocksize;
@@ -4722,7 +4714,7 @@ static bool ext4_should_use_dax(struct inode *inode)
return false;
if (ext4_has_inline_data(inode))
return false;
- if (ext4_encrypted_inode(inode))
+ if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
return false;
return true;
}
@@ -5072,7 +5064,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
- if (ext4_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
inode->i_op = &ext4_encrypted_symlink_inode_operations;
ext4_set_aops(inode);
} else if (ext4_inode_is_fast_symlink(inode)) {
@@ -5351,7 +5343,6 @@ static int ext4_do_update_inode(handle_t *handle,
err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
if (err)
goto out_brelse;
- ext4_update_dynamic_rev(sb);
ext4_set_feature_large_file(sb);
ext4_handle_sync(handle);
err = ext4_handle_dirty_super(handle, sb);
@@ -6002,7 +5993,7 @@ int ext4_expand_extra_isize(struct inode *inode,
ext4_write_lock_xattr(inode, &no_expand);
- BUFFER_TRACE(iloc.bh, "get_write_access");
+ BUFFER_TRACE(iloc->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, iloc->bh);
if (error) {
brelse(iloc->bh);
@@ -6089,36 +6080,6 @@ out:
return;
}
-#if 0
-/*
- * Bind an inode's backing buffer_head into this transaction, to prevent
- * it from being flushed to disk early. Unlike
- * ext4_reserve_inode_write, this leaves behind no bh reference and
- * returns no iloc structure, so the caller needs to repeat the iloc
- * lookup to mark the inode dirty later.
- */
-static int ext4_pin_inode(handle_t *handle, struct inode *inode)
-{
- struct ext4_iloc iloc;
-
- int err = 0;
- if (handle) {
- err = ext4_get_inode_loc(inode, &iloc);
- if (!err) {
- BUFFER_TRACE(iloc.bh, "get_write_access");
- err = jbd2_journal_get_write_access(handle, iloc.bh);
- if (!err)
- err = ext4_handle_dirty_metadata(handle,
- NULL,
- iloc.bh);
- brelse(iloc.bh);
- }
- }
- ext4_std_error(inode->i_sb, err);
- return err;
-}
-#endif
-
int ext4_change_inode_journal_flag(struct inode *inode, int val)
{
journal_t *journal;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d37dafa1d133..bab3da4f1e0d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -63,18 +63,20 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
loff_t isize;
struct ext4_inode_info *ei1;
struct ext4_inode_info *ei2;
+ unsigned long tmp;
ei1 = EXT4_I(inode1);
ei2 = EXT4_I(inode2);
swap(inode1->i_version, inode2->i_version);
- swap(inode1->i_blocks, inode2->i_blocks);
- swap(inode1->i_bytes, inode2->i_bytes);
swap(inode1->i_atime, inode2->i_atime);
swap(inode1->i_mtime, inode2->i_mtime);
memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
- swap(ei1->i_flags, ei2->i_flags);
+ tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP;
+ ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) |
+ (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP);
+ ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP);
swap(ei1->i_disksize, ei2->i_disksize);
ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
@@ -115,28 +117,42 @@ static long swap_inode_boot_loader(struct super_block *sb,
int err;
struct inode *inode_bl;
struct ext4_inode_info *ei_bl;
-
- if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
- IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
- ext4_has_inline_data(inode))
- return -EINVAL;
-
- if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
- !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ qsize_t size, size_bl, diff;
+ blkcnt_t blocks;
+ unsigned short bytes;
inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
if (IS_ERR(inode_bl))
return PTR_ERR(inode_bl);
ei_bl = EXT4_I(inode_bl);
- filemap_flush(inode->i_mapping);
- filemap_flush(inode_bl->i_mapping);
-
/* Protect orig inodes against a truncate and make sure,
* that only 1 swap_inode_boot_loader is running. */
lock_two_nondirectories(inode, inode_bl);
+ if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
+ IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
+ (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) ||
+ ext4_has_inline_data(inode)) {
+ err = -EINVAL;
+ goto journal_err_out;
+ }
+
+ if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
+ !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto journal_err_out;
+ }
+
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err)
+ goto err_out;
+
+ err = filemap_write_and_wait(inode_bl->i_mapping);
+ if (err)
+ goto err_out;
+
/* Wait for all existing dio workers */
inode_dio_wait(inode);
inode_dio_wait(inode_bl);
@@ -147,7 +163,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
if (IS_ERR(handle)) {
err = -EINVAL;
- goto journal_err_out;
+ goto err_out;
}
/* Protect extent tree against block allocations via delalloc */
@@ -170,6 +186,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
}
+ err = dquot_initialize(inode);
+ if (err)
+ goto err_out1;
+
+ size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes;
+ size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes;
+ diff = size - size_bl;
swap_inode_data(inode, inode_bl);
inode->i_ctime = inode_bl->i_ctime = current_time(inode);
@@ -183,34 +206,58 @@ static long swap_inode_boot_loader(struct super_block *sb,
err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
+ /* No need to update quota information. */
ext4_warning(inode->i_sb,
"couldn't mark inode #%lu dirty (err %d)",
inode->i_ino, err);
/* Revert all changes: */
swap_inode_data(inode, inode_bl);
ext4_mark_inode_dirty(handle, inode);
- } else {
- err = ext4_mark_inode_dirty(handle, inode_bl);
- if (err < 0) {
- ext4_warning(inode_bl->i_sb,
- "couldn't mark inode #%lu dirty (err %d)",
- inode_bl->i_ino, err);
- /* Revert all changes: */
- swap_inode_data(inode, inode_bl);
- ext4_mark_inode_dirty(handle, inode);
- ext4_mark_inode_dirty(handle, inode_bl);
- }
+ goto err_out1;
}
+
+ blocks = inode_bl->i_blocks;
+ bytes = inode_bl->i_bytes;
+ inode_bl->i_blocks = inode->i_blocks;
+ inode_bl->i_bytes = inode->i_bytes;
+ err = ext4_mark_inode_dirty(handle, inode_bl);
+ if (err < 0) {
+ /* No need to update quota information. */
+ ext4_warning(inode_bl->i_sb,
+ "couldn't mark inode #%lu dirty (err %d)",
+ inode_bl->i_ino, err);
+ goto revert;
+ }
+
+ /* Bootloader inode should not be counted into quota information. */
+ if (diff > 0)
+ dquot_free_space(inode, diff);
+ else
+ err = dquot_alloc_space(inode, -1 * diff);
+
+ if (err < 0) {
+revert:
+ /* Revert all changes: */
+ inode_bl->i_blocks = blocks;
+ inode_bl->i_bytes = bytes;
+ swap_inode_data(inode, inode_bl);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_mark_inode_dirty(handle, inode_bl);
+ }
+
+err_out1:
ext4_journal_stop(handle);
ext4_double_up_write_data_sem(inode, inode_bl);
+err_out:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
journal_err_out:
unlock_two_nondirectories(inode, inode_bl);
iput(inode_bl);
return err;
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int uuid_is_zero(__u8 u[16])
{
int i;
@@ -953,6 +1000,13 @@ resizefs_out:
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
+ /*
+ * We haven't replayed the journal, so we cannot use our
+ * block-bitmap-guided storage zapping commands.
+ */
+ if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb))
+ return -EROFS;
+
if (copy_from_user(&range, (struct fstrim_range __user *)arg,
sizeof(range)))
return -EFAULT;
@@ -978,7 +1032,7 @@ resizefs_out:
return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
int err, err2;
struct ext4_sb_info *sbi = EXT4_SB(sb);
handle_t *handle;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e2248083cdca..6fb76d408093 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4176,9 +4176,8 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
- if ((size == isize) &&
- !ext4_fs_is_busy(sbi) &&
- (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+ if ((size == isize) && !ext4_fs_is_busy(sbi) &&
+ !inode_is_open_for_write(ac->ac_inode)) {
ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
}
@@ -4258,7 +4257,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
(unsigned) ar->lleft, (unsigned) ar->pleft,
(unsigned) ar->lright, (unsigned) ar->pright,
- atomic_read(&ar->inode->i_writecount) ? "" : "non-");
+ inode_is_open_for_write(ar->inode) ? "" : "non-");
return 0;
}
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2f5be02fc6f6..1083a9f3f16a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -592,8 +592,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
return -EOPNOTSUPP;
}
- if (ext4_encrypted_inode(orig_inode) ||
- ext4_encrypted_inode(donor_inode)) {
+ if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
ext4_msg(orig_inode->i_sb, KERN_ERR,
"Online defrag not supported for encrypted files");
return -EOPNOTSUPP;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2b928eb07fa2..980166a8122a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -612,7 +612,7 @@ static struct stats dx_show_leaf(struct inode *dir,
{
if (show_names)
{
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
int len;
char *name;
struct fscrypt_str fname_crypto_str =
@@ -621,7 +621,7 @@ static struct stats dx_show_leaf(struct inode *dir,
name = de->name;
len = de->name_len;
- if (ext4_encrypted_inode(dir))
+ if (IS_ENCRYPTED(dir))
res = fscrypt_get_encryption_info(dir);
if (res) {
printk(KERN_WARNING "Error setting up"
@@ -984,9 +984,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
/* Check if the directory is encrypted */
- if (ext4_encrypted_inode(dir)) {
+ if (IS_ENCRYPTED(dir)) {
err = fscrypt_get_encryption_info(dir);
if (err < 0) {
brelse(bh);
@@ -1015,7 +1015,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
continue;
if (de->inode == 0)
continue;
- if (!ext4_encrypted_inode(dir)) {
+ if (!IS_ENCRYPTED(dir)) {
tmp_str.name = de->name;
tmp_str.len = de->name_len;
err = ext4_htree_store_dirent(dir_file,
@@ -1047,7 +1047,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
}
errout:
brelse(bh);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
fscrypt_fname_free_buffer(&fname_crypto_str);
#endif
return count;
@@ -1267,7 +1267,7 @@ static inline bool ext4_match(const struct ext4_filename *fname,
f.usr_fname = fname->usr_fname;
f.disk_name = fname->disk_name;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
f.crypto_buf = fname->crypto_buf;
#endif
return fscrypt_match_name(&f, de->name, de->name_len);
@@ -1498,7 +1498,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
ext4_lblk_t block;
int retval;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
*res_dir = NULL;
#endif
frame = dx_probe(fname, dir, NULL, frames);
@@ -1578,7 +1578,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
ino);
return ERR_PTR(-EFSCORRUPTED);
}
- if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
+ if (!IS_ERR(inode) && IS_ENCRYPTED(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
ext4_warning(inode->i_sb,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index cff4c4aa7a9c..3e9298e6a705 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -67,7 +67,7 @@ static void ext4_finish_bio(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
struct page *data_page = NULL;
#endif
struct buffer_head *bh, *head;
@@ -79,7 +79,7 @@ static void ext4_finish_bio(struct bio *bio)
if (!page)
continue;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (!page->mapping) {
/* The bounce data pages are unmapped. */
data_page = page;
@@ -112,7 +112,7 @@ static void ext4_finish_bio(struct bio *bio)
bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
local_irq_restore(flags);
if (!under_io) {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (data_page)
fscrypt_restore_control_page(data_page);
#endif
@@ -468,18 +468,15 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
ext4_io_submit(io);
continue;
}
- if (buffer_new(bh)) {
+ if (buffer_new(bh))
clear_buffer_new(bh);
- clean_bdev_bh_alias(bh);
- }
set_buffer_async_write(bh);
nr_to_submit++;
} while ((bh = bh->b_this_page) != head);
bh = head = page_buffers(page);
- if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
- nr_to_submit) {
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) {
gfp_t gfp_flags = GFP_NOFS;
retry_encrypt:
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index e53639784892..3adadf461825 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -49,7 +49,7 @@
static inline bool ext4_bio_encrypted(struct bio *bio)
{
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
return unlikely(bio->bi_private != NULL);
#else
return false;
@@ -243,8 +243,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
if (bio == NULL) {
struct fscrypt_ctx *ctx = NULL;
- if (ext4_encrypted_inode(inode) &&
- S_ISREG(inode->i_mode)) {
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
ctx = fscrypt_get_ctx(inode, GFP_NOFS);
if (IS_ERR(ctx))
goto set_error_page;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 48421de803b7..e7ae26e36c9c 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -932,11 +932,18 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
memcpy(n_group_desc, o_group_desc,
EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
n_group_desc[gdb_num] = gdb_bh;
+
+ BUFFER_TRACE(gdb_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gdb_bh);
+ if (err) {
+ kvfree(n_group_desc);
+ brelse(gdb_bh);
+ return err;
+ }
+
EXT4_SB(sb)->s_group_desc = n_group_desc;
EXT4_SB(sb)->s_gdb_count++;
kvfree(o_group_desc);
- BUFFER_TRACE(gdb_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb_bh);
return err;
}
@@ -1960,7 +1967,8 @@ retry:
le16_to_cpu(es->s_reserved_gdt_blocks);
n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
n_blocks_count = (ext4_fsblk_t)n_group *
- EXT4_BLOCKS_PER_GROUP(sb);
+ EXT4_BLOCKS_PER_GROUP(sb) +
+ le32_to_cpu(es->s_first_data_block);
n_group--; /* set to last group number */
}
@@ -2072,6 +2080,10 @@ out:
free_flex_gd(flex_gd);
if (resize_inode != NULL)
iput(resize_inode);
- ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count);
+ if (err)
+ ext4_warning(sb, "error (%d) occurred during "
+ "file system resize", err);
+ ext4_msg(sb, KERN_INFO, "resized filesystem to %llu",
+ ext4_blocks_count(es));
return err;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fb12d3c17c1b..6ed4eb81e674 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -430,6 +430,12 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
spin_unlock(&sbi->s_md_lock);
}
+static bool system_going_down(void)
+{
+ return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
+ || system_state == SYSTEM_RESTART;
+}
+
/* Deal with the reporting of failure conditions on a filesystem such as
* inconsistencies detected or read IO failures.
*
@@ -460,7 +466,12 @@ static void ext4_handle_error(struct super_block *sb)
if (journal)
jbd2_journal_abort(journal, -EIO);
}
- if (test_opt(sb, ERRORS_RO)) {
+ /*
+ * We force ERRORS_RO behavior when system is rebooting. Otherwise we
+ * could panic during 'reboot -f' as the underlying device got already
+ * disabled.
+ */
+ if (test_opt(sb, ERRORS_RO) || system_going_down()) {
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
/*
* Make sure updated value of ->s_mount_flags will be visible
@@ -468,8 +479,7 @@ static void ext4_handle_error(struct super_block *sb)
*/
smp_wmb();
sb->s_flags |= SB_RDONLY;
- }
- if (test_opt(sb, ERRORS_PANIC)) {
+ } else if (test_opt(sb, ERRORS_PANIC)) {
if (EXT4_SB(sb)->s_journal &&
!(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
return;
@@ -1232,7 +1242,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
return try_to_free_buffers(page);
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
{
return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
@@ -1922,7 +1932,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
*journal_ioprio =
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
} else if (token == Opt_test_dummy_encryption) {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
ext4_msg(sb, KERN_WARNING,
"Test dummy encryption mode enabled");
@@ -2249,7 +2259,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
le16_add_cpu(&es->s_mnt_count, 1);
ext4_update_tstamp(es, s_mtime);
- ext4_update_dynamic_rev(sb);
if (sbi->s_journal)
ext4_set_feature_journal_needs_recovery(sb);
@@ -4167,7 +4176,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &ext4_sops;
sb->s_export_op = &ext4_export_ops;
sb->s_xattr = ext4_xattr_handlers;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
sb->s_cop = &ext4_cryptops;
#endif
#ifdef CONFIG_QUOTA
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9212a026a1f1..616c075da062 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -30,6 +30,7 @@ typedef enum {
attr_feature,
attr_pointer_ui,
attr_pointer_atomic,
+ attr_journal_task,
} attr_id_t;
typedef enum {
@@ -125,6 +126,14 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
return count;
}
+static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
+{
+ if (!sbi->s_journal)
+ return snprintf(buf, PAGE_SIZE, "<none>\n");
+ return snprintf(buf, PAGE_SIZE, "%d\n",
+ task_pid_vnr(sbi->s_journal->j_task));
+}
+
#define EXT4_ATTR(_name,_mode,_id) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -188,6 +197,7 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
EXT4_ATTR(first_error_time, 0444, first_error_time);
EXT4_ATTR(last_error_time, 0444, last_error_time);
+EXT4_ATTR(journal_task, 0444, journal_task);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -217,6 +227,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(errors_count),
ATTR_LIST(first_error_time),
ATTR_LIST(last_error_time),
+ ATTR_LIST(journal_task),
NULL,
};
@@ -224,7 +235,7 @@ static struct attribute *ext4_attrs[] = {
EXT4_ATTR_FEATURE(lazy_itable_init);
EXT4_ATTR_FEATURE(batched_discard);
EXT4_ATTR_FEATURE(meta_bg_resize);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
EXT4_ATTR_FEATURE(encryption);
#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
@@ -233,7 +244,7 @@ static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
ATTR_LIST(batched_discard),
ATTR_LIST(meta_bg_resize),
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
#endif
ATTR_LIST(metadata_csum_seed),
@@ -304,6 +315,8 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
return print_tstamp(buf, sbi->s_es, s_first_error_time);
case attr_last_error_time:
return print_tstamp(buf, sbi->s_es, s_last_error_time);
+ case attr_journal_task:
+ return journal_task_show(sbi, buf);
}
return 0;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 86ed9c686249..dc82e7757f67 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -829,6 +829,7 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
+ bh = NULL;
goto out;
}
@@ -2903,6 +2904,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
if (error == -EIO)
EXT4_ERROR_INODE(inode, "block %llu read error",
EXT4_I(inode)->i_file_acl);
+ bh = NULL;
goto cleanup;
}
error = ext4_xattr_check_block(inode, bh);
@@ -3059,6 +3061,7 @@ ext4_xattr_block_cache_find(struct inode *inode,
if (IS_ERR(bh)) {
if (PTR_ERR(bh) == -ENOMEM)
return NULL;
+ bh = NULL;
EXT4_ERROR_INODE(inode, "block %lu read error",
(unsigned long)ce->e_value);
} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 9a20ef42fadd..e57cc754d543 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -3,6 +3,7 @@ config F2FS_FS
depends on BLOCK
select CRYPTO
select CRYPTO_CRC32
+ select F2FS_FS_XATTR if FS_ENCRYPTION
help
F2FS is based on Log-structured File System (LFS), which supports
versatile "flash-friendly" features. The design has been focused on
@@ -70,17 +71,6 @@ config F2FS_CHECK_FS
If you want to improve the performance, say N.
-config F2FS_FS_ENCRYPTION
- bool "F2FS Encryption"
- depends on F2FS_FS
- depends on F2FS_FS_XATTR
- select FS_ENCRYPTION
- help
- Enable encryption of f2fs files and directories. This
- feature is similar to ecryptfs, but it is more memory
- efficient since it avoids caching the encrypted and
- decrypted pages in the page cache.
-
config F2FS_IO_TRACE
bool "F2FS IO tracer"
depends on F2FS_FS
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index f955cd3e0677..a98e1b02279e 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -306,8 +306,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
goto skip_write;
/* collect a number of dirty meta pages and write together */
- if (wbc->for_kupdate ||
- get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
+ if (wbc->sync_mode != WB_SYNC_ALL &&
+ get_pages(sbi, F2FS_DIRTY_META) <
+ nr_pages_to_skip(sbi, META))
goto skip_write;
/* if locked failed, cp will flush dirty pages instead */
@@ -405,7 +406,7 @@ static int f2fs_set_meta_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
- SetPagePrivate(page);
+ f2fs_set_page_private(page, 0);
f2fs_trace_pid(page);
return 1;
}
@@ -956,7 +957,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page)
inode_inc_dirty_pages(inode);
spin_unlock(&sbi->inode_lock[type]);
- SetPagePrivate(page);
+ f2fs_set_page_private(page, 0);
f2fs_trace_pid(page);
}
@@ -1259,10 +1260,17 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
else
__clear_ckpt_flags(ckpt, CP_DISABLED_FLAG);
+ if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK))
+ __set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);
+
if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH))
__set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
- else
- __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
+ /*
+ * TODO: we count on fsck.f2fs to clear this flag until we figure out
+ * missing cases which clear it incorrectly.
+ */
if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR))
__set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index da060b77f64d..9727944139f2 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -301,9 +301,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
for (; start < F2FS_IO_SIZE(sbi); start++) {
struct page *page =
mempool_alloc(sbi->write_io_dummy,
- GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL);
+ GFP_NOIO | __GFP_NOFAIL);
f2fs_bug_on(sbi, !page);
+ zero_user_segment(page, 0, PAGE_SIZE);
SetPagePrivate(page);
set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE);
lock_page(page);
@@ -1468,7 +1469,7 @@ next:
}
if (size) {
- if (f2fs_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
flags |= FIEMAP_EXTENT_DATA_ENCRYPTED;
ret = fiemap_fill_next_extent(fieinfo, logical,
@@ -1553,6 +1554,9 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
if (last_block > last_block_in_file)
last_block = last_block_in_file;
+ /* just zeroing out page which is beyond EOF */
+ if (block_in_file >= last_block)
+ goto zero_out;
/*
* Map blocks using the previous result first.
*/
@@ -1565,16 +1569,11 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
* Then do more f2fs_map_blocks() calls until we are
* done with this page.
*/
- map.m_flags = 0;
-
- if (block_in_file < last_block) {
- map.m_lblk = block_in_file;
- map.m_len = last_block - block_in_file;
+ map.m_lblk = block_in_file;
+ map.m_len = last_block - block_in_file;
- if (f2fs_map_blocks(inode, &map, 0,
- F2FS_GET_BLOCK_DEFAULT))
- goto set_error_page;
- }
+ if (f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT))
+ goto set_error_page;
got_it:
if ((map.m_flags & F2FS_MAP_MAPPED)) {
block_nr = map.m_pblk + block_in_file - map.m_lblk;
@@ -1589,6 +1588,7 @@ got_it:
DATA_GENERIC))
goto set_error_page;
} else {
+zero_out:
zero_user_segment(page, 0, PAGE_SIZE);
if (!PageUptodate(page))
SetPageUptodate(page);
@@ -1739,7 +1739,7 @@ static inline bool check_inplace_update_policy(struct inode *inode,
if (policy & (0x1 << F2FS_IPU_ASYNC) &&
fio && fio->op == REQ_OP_WRITE &&
!(fio->op_flags & REQ_SYNC) &&
- !f2fs_encrypted_inode(inode))
+ !IS_ENCRYPTED(inode))
return true;
/* this is only set during fdatasync */
@@ -1863,8 +1863,13 @@ got_it:
if (fio->need_lock == LOCK_REQ)
f2fs_unlock_op(fio->sbi);
err = f2fs_inplace_write_data(fio);
- if (err && PageWriteback(page))
- end_page_writeback(page);
+ if (err) {
+ if (f2fs_encrypted_file(inode))
+ fscrypt_pullback_bio_page(&fio->encrypted_page,
+ true);
+ if (PageWriteback(page))
+ end_page_writeback(page);
+ }
trace_f2fs_do_write_data_page(fio->page, IPU);
set_inode_flag(inode, FI_UPDATE_WRITE);
return err;
@@ -2315,7 +2320,8 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_pagecache(inode, i_size);
- f2fs_truncate_blocks(inode, i_size, true, true);
+ if (!IS_NOQUOTA(inode))
+ f2fs_truncate_blocks(inode, i_size, true);
up_write(&F2FS_I(inode)->i_mmap_sem);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -2585,14 +2591,11 @@ static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode,
{
struct f2fs_private_dio *dio;
bool write = (bio_op(bio) == REQ_OP_WRITE);
- int err;
dio = f2fs_kzalloc(F2FS_I_SB(inode),
sizeof(struct f2fs_private_dio), GFP_NOFS);
- if (!dio) {
- err = -ENOMEM;
+ if (!dio)
goto out;
- }
dio->inode = inode;
dio->orig_end_io = bio->bi_end_io;
@@ -2710,12 +2713,10 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
clear_cold_data(page);
- /* This is atomic written page, keep Private */
if (IS_ATOMIC_WRITTEN_PAGE(page))
return f2fs_drop_inmem_page(inode, page);
- set_page_private(page, 0);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
}
int f2fs_release_page(struct page *page, gfp_t wait)
@@ -2729,8 +2730,7 @@ int f2fs_release_page(struct page *page, gfp_t wait)
return 0;
clear_cold_data(page);
- set_page_private(page, 0);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
return 1;
}
@@ -2798,12 +2798,8 @@ int f2fs_migrate_page(struct address_space *mapping,
return -EAGAIN;
}
- /*
- * A reference is expected if PagePrivate set when move mapping,
- * however F2FS breaks this for maintaining dirty page counts when
- * truncating pages. So here adjusting the 'extra_count' make it work.
- */
- extra_count = (atomic_written ? 1 : 0) - page_has_private(page);
+ /* one extra reference was held for atomic_write page */
+ extra_count = atomic_written ? 1 : 0;
rc = migrate_page_move_mapping(mapping, newpage,
page, mode, extra_count);
if (rc != MIGRATEPAGE_SUCCESS) {
@@ -2824,9 +2820,10 @@ int f2fs_migrate_page(struct address_space *mapping,
get_page(newpage);
}
- if (PagePrivate(page))
- SetPagePrivate(newpage);
- set_page_private(newpage, page_private(page));
+ if (PagePrivate(page)) {
+ f2fs_set_page_private(newpage, page_private(page));
+ f2fs_clear_page_private(page);
+ }
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index fd7f170e2f2d..99e9a5c37b71 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -96,8 +96,10 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->free_secs = free_sections(sbi);
si->prefree_count = prefree_segments(sbi);
si->dirty_count = dirty_segments(sbi);
- si->node_pages = NODE_MAPPING(sbi)->nrpages;
- si->meta_pages = META_MAPPING(sbi)->nrpages;
+ if (sbi->node_inode)
+ si->node_pages = NODE_MAPPING(sbi)->nrpages;
+ if (sbi->meta_inode)
+ si->meta_pages = META_MAPPING(sbi)->nrpages;
si->nats = NM_I(sbi)->nat_cnt;
si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
si->sits = MAIN_SEGS(sbi);
@@ -175,7 +177,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
static void update_mem_info(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
- unsigned npages;
int i;
if (si->base_mem)
@@ -258,10 +259,14 @@ get_cache:
sizeof(struct extent_node);
si->page_mem = 0;
- npages = NODE_MAPPING(sbi)->nrpages;
- si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
- npages = META_MAPPING(sbi)->nrpages;
- si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+ if (sbi->node_inode) {
+ unsigned npages = NODE_MAPPING(sbi)->nrpages;
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+ }
+ if (sbi->meta_inode) {
+ unsigned npages = META_MAPPING(sbi)->nrpages;
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+ }
}
static int stat_show(struct seq_file *s, void *v)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 50d0d36280fa..59bc46017855 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -385,7 +385,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
if (err)
goto put_error;
- if ((f2fs_encrypted_inode(dir) || dummy_encrypt) &&
+ if ((IS_ENCRYPTED(dir) || dummy_encrypt) &&
f2fs_may_encrypt(inode)) {
err = fscrypt_inherit_context(dir, inode, page, false);
if (err)
@@ -399,7 +399,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
if (new_name) {
init_dent_inode(new_name, page);
- if (f2fs_encrypted_inode(dir))
+ if (IS_ENCRYPTED(dir))
file_set_enc_name(inode);
}
@@ -728,7 +728,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
f2fs_clear_page_cache_dirty_tag(page);
clear_page_dirty_for_io(page);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
ClearPageUptodate(page);
clear_cold_data(page);
inode_dec_dirty_pages(dir);
@@ -800,6 +800,10 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
if (de->name_len == 0) {
bit_pos++;
ctx->pos = start_pos + bit_pos;
+ printk_ratelimited(
+ "%s, invalid namelen(0), ino:%u, run fsck to fix.",
+ KERN_WARNING, le32_to_cpu(de->ino));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
continue;
}
@@ -810,7 +814,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
/* check memory boundary before moving forward */
bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
- if (unlikely(bit_pos > d->max)) {
+ if (unlikely(bit_pos > d->max ||
+ le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) {
f2fs_msg(sbi->sb, KERN_WARNING,
"%s: corrupted namelen=%d, run fsck to fix.",
__func__, le16_to_cpu(de->name_len));
@@ -819,7 +824,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
goto out;
}
- if (f2fs_encrypted_inode(d->inode)) {
+ if (IS_ENCRYPTED(d->inode)) {
int save_len = fstr->len;
err = fscrypt_fname_disk_to_usr(d->inode,
@@ -862,7 +867,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
int err = 0;
- if (f2fs_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = fscrypt_get_encryption_info(inode);
if (err && err != -ENOKEY)
goto out;
@@ -891,7 +896,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
page_cache_sync_readahead(inode->i_mapping, ra, file, n,
min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
- dentry_page = f2fs_get_lock_data_page(inode, n, false);
+ dentry_page = f2fs_find_data_page(inode, n);
if (IS_ERR(dentry_page)) {
err = PTR_ERR(dentry_page);
if (err == -ENOENT) {
@@ -909,11 +914,11 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
err = f2fs_fill_dentries(ctx, &d,
n * NR_DENTRY_IN_BLOCK, &fstr);
if (err) {
- f2fs_put_page(dentry_page, 1);
+ f2fs_put_page(dentry_page, 0);
break;
}
- f2fs_put_page(dentry_page, 1);
+ f2fs_put_page(dentry_page, 0);
}
out_free:
fscrypt_fname_free_buffer(&fstr);
@@ -924,7 +929,7 @@ out:
static int f2fs_dir_open(struct inode *inode, struct file *filp)
{
- if (f2fs_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
return 0;
}
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 1cb0fcc67d2d..caf77fe8ac07 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -506,7 +506,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
unsigned int end = fofs + len;
unsigned int pos = (unsigned int)fofs;
bool updated = false;
- bool leftmost;
+ bool leftmost = false;
if (!et)
return;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 02d0cd199828..bacf5c2a8850 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -24,7 +24,6 @@
#include <linux/quotaops.h>
#include <crypto/hash.h>
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION)
#include <linux/fscrypt.h>
#ifdef CONFIG_F2FS_CHECK_FS
@@ -191,6 +190,8 @@ enum {
#define DEF_CP_INTERVAL 60 /* 60 secs */
#define DEF_IDLE_INTERVAL 5 /* 5 secs */
#define DEF_DISABLE_INTERVAL 5 /* 5 secs */
+#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */
+#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */
struct cp_control {
int reason;
@@ -254,7 +255,7 @@ struct discard_entry {
/* max discard pend list number */
#define MAX_PLIST_NUM 512
#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \
- (MAX_PLIST_NUM - 1) : (blk_num - 1))
+ (MAX_PLIST_NUM - 1) : ((blk_num) - 1))
enum {
D_PREP, /* initial */
@@ -310,6 +311,7 @@ struct discard_policy {
bool sync; /* submit discard with REQ_SYNC flag */
bool ordered; /* issue discard by lba order */
unsigned int granularity; /* discard granularity */
+ int timeout; /* discard timeout for put_super */
};
struct discard_cmd_control {
@@ -456,7 +458,6 @@ struct f2fs_flush_device {
/* for inline stuff */
#define DEF_INLINE_RESERVED_SIZE 1
-#define DEF_MIN_INLINE_SIZE 1
static inline int get_extra_isize(struct inode *inode);
static inline int get_inline_xattr_addrs(struct inode *inode);
#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \
@@ -1099,6 +1100,7 @@ enum {
SBI_IS_SHUTDOWN, /* shutdown by ioctl */
SBI_IS_RECOVERED, /* recovered orphan/data */
SBI_CP_DISABLED, /* CP was disabled last mount */
+ SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */
SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */
SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */
SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */
@@ -1110,6 +1112,7 @@ enum {
DISCARD_TIME,
GC_TIME,
DISABLE_TIME,
+ UMOUNT_DISCARD_TIMEOUT,
MAX_TIME,
};
@@ -1137,7 +1140,7 @@ enum fsync_mode {
FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */
};
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
#define DUMMY_ENCRYPTION_ENABLED(sbi) \
(unlikely(F2FS_OPTION(sbi).test_dummy_encryption))
#else
@@ -1238,8 +1241,6 @@ struct f2fs_sb_info {
unsigned int nquota_files; /* # of quota sysfile */
- u32 s_next_generation; /* for NFS support */
-
/* # of pages, see count_type */
atomic_t nr_pages[NR_COUNT_TYPE];
/* # of allocated blocks */
@@ -1421,7 +1422,6 @@ static inline u32 __f2fs_crc32(struct f2fs_sb_info *sbi, u32 crc,
BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx));
desc.shash.tfm = sbi->s_chksum_driver;
- desc.shash.flags = 0;
*(u32 *)desc.ctx = crc;
err = crypto_shash_update(&desc.shash, address, length);
@@ -1799,13 +1799,12 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
{
atomic_inc(&sbi->nr_pages[count_type]);
- if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES ||
- count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA ||
- count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE ||
- count_type == F2FS_RD_META)
- return;
-
- set_sbi_flag(sbi, SBI_IS_DIRTY);
+ if (count_type == F2FS_DIRTY_DENTS ||
+ count_type == F2FS_DIRTY_NODES ||
+ count_type == F2FS_DIRTY_META ||
+ count_type == F2FS_DIRTY_QDATA ||
+ count_type == F2FS_DIRTY_IMETA)
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
}
static inline void inode_inc_dirty_pages(struct inode *inode)
@@ -2157,10 +2156,17 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) ||
get_pages(sbi, F2FS_WB_CP_DATA) ||
get_pages(sbi, F2FS_DIO_READ) ||
- get_pages(sbi, F2FS_DIO_WRITE) ||
- atomic_read(&SM_I(sbi)->dcc_info->queued_discard) ||
- atomic_read(&SM_I(sbi)->fcc_info->queued_flush))
+ get_pages(sbi, F2FS_DIO_WRITE))
+ return false;
+
+ if (SM_I(sbi) && SM_I(sbi)->dcc_info &&
+ atomic_read(&SM_I(sbi)->dcc_info->queued_discard))
+ return false;
+
+ if (SM_I(sbi) && SM_I(sbi)->fcc_info &&
+ atomic_read(&SM_I(sbi)->fcc_info->queued_flush))
return false;
+
return f2fs_time_over(sbi, type);
}
@@ -2301,11 +2307,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
#define F2FS_EXTENTS_FL 0x00080000 /* Inode uses extents */
#define F2FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */
#define F2FS_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
+#define F2FS_NOCOW_FL 0x00800000 /* Do not cow file */
#define F2FS_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define F2FS_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define F2FS_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */
+#define F2FS_FL_USER_VISIBLE 0x30CBDFFF /* User visible flags */
#define F2FS_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */
/* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */
@@ -2762,9 +2769,9 @@ static inline int get_inline_xattr_addrs(struct inode *inode)
#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr))
#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \
- ((offsetof(typeof(*f2fs_inode), field) + \
+ ((offsetof(typeof(*(f2fs_inode)), field) + \
sizeof((f2fs_inode)->field)) \
- <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \
+ <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \
static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
{
@@ -2793,8 +2800,8 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1)
-#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \
- (!is_read_io(fio->op) || fio->is_meta))
+#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META && \
+ (!is_read_io((fio)->op) || (fio)->is_meta))
bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type);
@@ -2826,13 +2833,33 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi,
return true;
}
+static inline void f2fs_set_page_private(struct page *page,
+ unsigned long data)
+{
+ if (PagePrivate(page))
+ return;
+
+ get_page(page);
+ SetPagePrivate(page);
+ set_page_private(page, data);
+}
+
+static inline void f2fs_clear_page_private(struct page *page)
+{
+ if (!PagePrivate(page))
+ return;
+
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+ f2fs_put_page(page, 0);
+}
+
/*
* file.c
*/
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
void f2fs_truncate_data_blocks(struct dnode_of_data *dn);
-int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
- bool buf_write);
+int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock);
int f2fs_truncate(struct inode *inode);
int f2fs_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags);
@@ -3006,7 +3033,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr);
bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr);
void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi);
void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi);
-bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi);
+bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi);
void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
struct cp_control *cpc);
void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi);
@@ -3463,19 +3490,14 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi);
/*
* crypto support
*/
-static inline bool f2fs_encrypted_inode(struct inode *inode)
-{
- return file_is_encrypt(inode);
-}
-
static inline bool f2fs_encrypted_file(struct inode *inode)
{
- return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode);
+ return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode);
}
static inline void f2fs_set_encrypted_inode(struct inode *inode)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
file_set_encrypt(inode);
f2fs_set_inode_flags(inode);
#endif
@@ -3554,7 +3576,7 @@ static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
static inline bool f2fs_may_encrypt(struct inode *inode)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
umode_t mode = inode->i_mode;
return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
@@ -3616,8 +3638,6 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
#define f2fs_build_fault_attr(sbi, rate, type) do { } while (0)
#endif
-#endif
-
static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
{
#ifdef CONFIG_QUOTA
@@ -3630,3 +3650,5 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
#endif
return false;
}
+
+#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index bba56b39dcc5..5742ab8b57dc 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -582,15 +582,14 @@ truncate_out:
zero_user(page, offset, PAGE_SIZE - offset);
/* An encrypted inode should have a key and truncate the last page. */
- f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode));
+ f2fs_bug_on(F2FS_I_SB(inode), cache_only && IS_ENCRYPTED(inode));
if (!cache_only)
set_page_dirty(page);
f2fs_put_page(page, 1);
return 0;
}
-int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
- bool buf_write)
+int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
@@ -598,7 +597,6 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
int count = 0, err = 0;
struct page *ipage;
bool truncate_page = false;
- int flag = buf_write ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO;
trace_f2fs_truncate_blocks_enter(inode, from);
@@ -608,7 +606,7 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
goto free_partial;
if (lock)
- __do_map_lock(sbi, flag, true);
+ f2fs_lock_op(sbi);
ipage = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(ipage)) {
@@ -646,7 +644,7 @@ free_next:
err = f2fs_truncate_inode_blocks(inode, free_from);
out:
if (lock)
- __do_map_lock(sbi, flag, false);
+ f2fs_unlock_op(sbi);
free_partial:
/* lastly zero out the first data page */
if (!err)
@@ -681,7 +679,7 @@ int f2fs_truncate(struct inode *inode)
return err;
}
- err = f2fs_truncate_blocks(inode, i_size_read(inode), true, false);
+ err = f2fs_truncate_blocks(inode, i_size_read(inode), true);
if (err)
return err;
@@ -711,7 +709,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
stat->attributes |= STATX_ATTR_APPEND;
if (flags & F2FS_COMPR_FL)
stat->attributes |= STATX_ATTR_COMPRESSED;
- if (f2fs_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
stat->attributes |= STATX_ATTR_ENCRYPTED;
if (flags & F2FS_IMMUTABLE_FL)
stat->attributes |= STATX_ATTR_IMMUTABLE;
@@ -768,7 +766,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int err;
- bool size_changed = false;
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
@@ -843,8 +840,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
down_write(&F2FS_I(inode)->i_sem);
F2FS_I(inode)->last_disk_size = i_size_read(inode);
up_write(&F2FS_I(inode)->i_sem);
-
- size_changed = true;
}
__setattr_copy(inode, attr);
@@ -858,7 +853,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
}
/* file size may changed here */
- f2fs_mark_inode_dirty_sync(inode, size_changed);
+ f2fs_mark_inode_dirty_sync(inode, true);
/* inode change will produce dirty node pages flushed by checkpoint */
f2fs_balance_fs(F2FS_I_SB(inode), true);
@@ -1262,7 +1257,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
new_size = i_size_read(inode) - len;
truncate_pagecache(inode, new_size);
- ret = f2fs_truncate_blocks(inode, new_size, true, false);
+ ret = f2fs_truncate_blocks(inode, new_size, true);
up_write(&F2FS_I(inode)->i_mmap_sem);
if (!ret)
f2fs_i_size_write(inode, new_size);
@@ -1447,7 +1442,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
f2fs_balance_fs(sbi, true);
down_write(&F2FS_I(inode)->i_mmap_sem);
- ret = f2fs_truncate_blocks(inode, i_size_read(inode), true, false);
+ ret = f2fs_truncate_blocks(inode, i_size_read(inode), true);
up_write(&F2FS_I(inode)->i_mmap_sem);
if (ret)
return ret;
@@ -1563,7 +1558,7 @@ static long f2fs_fallocate(struct file *file, int mode,
if (!S_ISREG(inode->i_mode))
return -EINVAL;
- if (f2fs_encrypted_inode(inode) &&
+ if (IS_ENCRYPTED(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
@@ -1647,10 +1642,12 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
struct f2fs_inode_info *fi = F2FS_I(inode);
unsigned int flags = fi->i_flags;
- if (f2fs_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
flags |= F2FS_ENCRYPT_FL;
if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode))
flags |= F2FS_INLINE_DATA_FL;
+ if (is_inode_flag_set(inode, FI_PIN_FILE))
+ flags |= F2FS_NOCOW_FL;
flags &= F2FS_FL_USER_VISIBLE;
@@ -1750,10 +1747,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- if (!get_dirty_pages(inode))
- goto skip_flush;
-
- f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
+ /*
+ * Should wait end_io to count F2FS_WB_CP_DATA correctly by
+ * f2fs_is_atomic_file.
+ */
+ if (get_dirty_pages(inode))
+ f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
"Unexpected flush for atomic writes: ino=%lu, npages=%u",
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
@@ -1761,7 +1760,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
}
-skip_flush:
+
set_inode_flag(inode, FI_ATOMIC_FILE);
clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -1968,11 +1967,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
break;
case F2FS_GOING_DOWN_NEED_FSCK:
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
/* do checkpoint only */
ret = f2fs_sync_fs(sb, 1);
- if (ret)
- goto out;
- break;
+ goto out;
default:
ret = -EINVAL;
goto out;
@@ -1988,6 +1987,9 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
out:
if (in != F2FS_GOING_DOWN_FULLSYNC)
mnt_drop_write_file(filp);
+
+ trace_f2fs_shutdown(sbi, in, ret);
+
return ret;
}
@@ -2414,7 +2416,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode))
return -EINVAL;
- if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst))
+ if (IS_ENCRYPTED(src) || IS_ENCRYPTED(dst))
return -EOPNOTSUPP;
if (src == dst) {
@@ -2871,8 +2873,8 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
__u32 pin;
int ret = 0;
- if (!inode_owner_or_capable(inode))
- return -EACCES;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
if (get_user(pin, (__u32 __user *)arg))
return -EFAULT;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index d636cbcf68f2..bb6a152310ef 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -298,7 +298,7 @@ process_inline:
clear_inode_flag(inode, FI_INLINE_DATA);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
- if (f2fs_truncate_blocks(inode, 0, false, false))
+ if (f2fs_truncate_blocks(inode, 0, false))
return false;
goto process_inline;
}
@@ -470,7 +470,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry)
return 0;
punch_dentry_pages:
truncate_inode_pages(&dir->i_data, 0);
- f2fs_truncate_blocks(dir, 0, false, false);
+ f2fs_truncate_blocks(dir, 0, false);
f2fs_remove_dirty_inode(dir);
return err;
}
@@ -659,6 +659,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
if (IS_ERR(ipage))
return PTR_ERR(ipage);
+ /*
+ * f2fs_readdir was protected by inode.i_rwsem, it is safe to access
+ * ipage without page's lock held.
+ */
+ unlock_page(ipage);
+
inline_dentry = inline_data_addr(inode, ipage);
make_dentry_ptr_inline(inode, &d, inline_dentry);
@@ -667,7 +673,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
if (!err)
ctx->pos = d.max;
- f2fs_put_page(ipage, 1);
+ f2fs_put_page(ipage, 0);
return err < 0 ? err : 0;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index bec52961630b..e7f2e8759315 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -14,6 +14,7 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include "xattr.h"
#include <trace/events/f2fs.h>
@@ -43,7 +44,7 @@ void f2fs_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & F2FS_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (f2fs_encrypted_inode(inode))
+ if (file_is_encrypt(inode))
new_fl |= S_ENCRYPTED;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|
@@ -248,6 +249,20 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
+ if (f2fs_has_extra_attr(inode) &&
+ f2fs_sb_has_flexible_inline_xattr(sbi) &&
+ f2fs_has_inline_xattr(inode) &&
+ (!fi->i_inline_xattr_size ||
+ fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: inode (ino=%lx) has corrupted "
+ "i_inline_xattr_size: %d, max: %zu",
+ __func__, inode->i_ino, fi->i_inline_xattr_size,
+ MAX_INLINE_XATTR_SIZE);
+ return false;
+ }
+
if (F2FS_I(inode)->extent_tree) {
struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest;
@@ -453,7 +468,7 @@ make_now:
inode->i_mapping->a_ops = &f2fs_dblock_aops;
inode_nohighmem(inode);
} else if (S_ISLNK(inode->i_mode)) {
- if (f2fs_encrypted_inode(inode))
+ if (file_is_encrypt(inode))
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
else
inode->i_op = &f2fs_symlink_inode_operations;
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 62d9829f3a6a..f5e34e467003 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/ctype.h>
+#include <linux/random.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
@@ -50,7 +51,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
inode->i_blocks = 0;
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
F2FS_I(inode)->i_crtime = inode->i_mtime;
- inode->i_generation = sbi->s_next_generation++;
+ inode->i_generation = prandom_u32();
if (S_ISDIR(inode->i_mode))
F2FS_I(inode)->i_current_depth = 1;
@@ -75,7 +76,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
set_inode_flag(inode, FI_NEW_INODE);
/* If the directory encrypted, then we should encrypt the inode. */
- if ((f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
+ if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
f2fs_may_encrypt(inode))
f2fs_set_encrypted_inode(inode);
@@ -476,7 +477,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
if (err)
goto out_iput;
}
- if (f2fs_encrypted_inode(dir) &&
+ if (IS_ENCRYPTED(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
f2fs_msg(inode->i_sb, KERN_WARNING,
@@ -803,7 +804,7 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
- if (f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) {
+ if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) {
int err = fscrypt_get_encryption_info(dir);
if (err)
return err;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 4f450e573312..3f99ab288695 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1920,7 +1920,9 @@ static int f2fs_write_node_pages(struct address_space *mapping,
f2fs_balance_fs_bg(sbi);
/* collect a number of dirty node pages and write together */
- if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
+ if (wbc->sync_mode != WB_SYNC_ALL &&
+ get_pages(sbi, F2FS_DIRTY_NODES) <
+ nr_pages_to_skip(sbi, NODE))
goto skip_write;
if (wbc->sync_mode == WB_SYNC_ALL)
@@ -1959,7 +1961,7 @@ static int f2fs_set_node_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
- SetPagePrivate(page);
+ f2fs_set_page_private(page, 0);
f2fs_trace_pid(page);
return 1;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 9b79056d705d..aa7fe79b62b2 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -191,8 +191,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
f2fs_trace_pid(page);
- set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
- SetPagePrivate(page);
+ f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
@@ -215,7 +214,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
}
static int __revoke_inmem_pages(struct inode *inode,
- struct list_head *head, bool drop, bool recover)
+ struct list_head *head, bool drop, bool recover,
+ bool trylock)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct inmem_pages *cur, *tmp;
@@ -227,7 +227,16 @@ static int __revoke_inmem_pages(struct inode *inode,
if (drop)
trace_f2fs_commit_inmem_page(page, INMEM_DROP);
- lock_page(page);
+ if (trylock) {
+ /*
+ * to avoid deadlock in between page lock and
+ * inmem_lock.
+ */
+ if (!trylock_page(page))
+ continue;
+ } else {
+ lock_page(page);
+ }
f2fs_wait_on_page_writeback(page, DATA, true, true);
@@ -270,8 +279,7 @@ next:
ClearPageUptodate(page);
clear_cold_data(page);
}
- set_page_private(page, 0);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
f2fs_put_page(page, 1);
list_del(&cur->list);
@@ -318,13 +326,19 @@ void f2fs_drop_inmem_pages(struct inode *inode)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
- mutex_lock(&fi->inmem_lock);
- __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
- spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- if (!list_empty(&fi->inmem_ilist))
- list_del_init(&fi->inmem_ilist);
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
- mutex_unlock(&fi->inmem_lock);
+ while (!list_empty(&fi->inmem_pages)) {
+ mutex_lock(&fi->inmem_lock);
+ __revoke_inmem_pages(inode, &fi->inmem_pages,
+ true, false, true);
+
+ if (list_empty(&fi->inmem_pages)) {
+ spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+ if (!list_empty(&fi->inmem_ilist))
+ list_del_init(&fi->inmem_ilist);
+ spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+ }
+ mutex_unlock(&fi->inmem_lock);
+ }
clear_inode_flag(inode, FI_ATOMIC_FILE);
fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
@@ -354,8 +368,7 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page)
kmem_cache_free(inmem_entry_slab, cur);
ClearPageUptodate(page);
- set_page_private(page, 0);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
f2fs_put_page(page, 0);
trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE);
@@ -429,12 +442,15 @@ retry:
* recovery or rewrite & commit last transaction. For other
* error number, revoking was done by filesystem itself.
*/
- err = __revoke_inmem_pages(inode, &revoke_list, false, true);
+ err = __revoke_inmem_pages(inode, &revoke_list,
+ false, true, false);
/* drop all uncommitted pages */
- __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+ __revoke_inmem_pages(inode, &fi->inmem_pages,
+ true, false, false);
} else {
- __revoke_inmem_pages(inode, &revoke_list, false, false);
+ __revoke_inmem_pages(inode, &revoke_list,
+ false, false, false);
}
return err;
@@ -542,9 +558,13 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
static int __submit_flush_wait(struct f2fs_sb_info *sbi,
struct block_device *bdev)
{
- struct bio *bio = f2fs_bio_alloc(sbi, 0, true);
+ struct bio *bio;
int ret;
+ bio = f2fs_bio_alloc(sbi, 0, false);
+ if (!bio)
+ return -ENOMEM;
+
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
bio_set_dev(bio, bdev);
ret = submit_bio_wait(bio);
@@ -868,6 +888,9 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi)
if (holes[DATA] > ovp || holes[NODE] > ovp)
return -EAGAIN;
+ if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) &&
+ dirty_segments(sbi) > overprovision_segments(sbi))
+ return -EAGAIN;
return 0;
}
@@ -1037,6 +1060,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
dpolicy->io_aware_gran = MAX_PLIST_NUM;
+ dpolicy->timeout = 0;
if (discard_type == DPOLICY_BG) {
dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
@@ -1059,6 +1083,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
} else if (discard_type == DPOLICY_UMOUNT) {
dpolicy->max_requests = UINT_MAX;
dpolicy->io_aware = false;
+ /* we need to issue all to keep CP_TRIMMED_FLAG */
+ dpolicy->granularity = 1;
}
}
@@ -1424,7 +1450,14 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
int i, issued = 0;
bool io_interrupted = false;
+ if (dpolicy->timeout != 0)
+ f2fs_update_time(sbi, dpolicy->timeout);
+
for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+ if (dpolicy->timeout != 0 &&
+ f2fs_time_over(sbi, dpolicy->timeout))
+ break;
+
if (i + 1 < dpolicy->granularity)
break;
@@ -1611,7 +1644,7 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi)
}
/* This comes from f2fs_put_super */
-bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
+bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
{
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
struct discard_policy dpolicy;
@@ -1619,6 +1652,7 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
__init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
dcc->discard_granularity);
+ dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT;
__issue_discard_cmd(sbi, &dpolicy);
dropped = __drop_discard_cmd(sbi);
@@ -3164,10 +3198,10 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
stat_inc_inplace_blocks(fio->sbi);
err = f2fs_submit_page_bio(fio);
- if (!err)
+ if (!err) {
update_device_state(fio);
-
- f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
+ f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
+ }
return err;
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index a77f76f528b6..5c7ed0442d6e 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -865,7 +865,7 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force)
}
}
mutex_unlock(&dcc->cmd_lock);
- if (!wakeup)
+ if (!wakeup || !is_idle(sbi, DISCARD_TIME))
return;
wake_up:
dcc->discard_wake = 1;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3d3ce9eb6d13..f2aaa2cc6b3e 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -269,7 +269,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype,
if (!qname) {
f2fs_msg(sb, KERN_ERR,
"Not enough memory for storing quotafile name");
- return -EINVAL;
+ return -ENOMEM;
}
if (F2FS_OPTION(sbi).s_qf_names[qtype]) {
if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0)
@@ -586,7 +586,7 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_io_size_bits:
if (args->from && match_int(args, &arg))
return -EINVAL;
- if (arg > __ilog2_u32(BIO_MAX_PAGES)) {
+ if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) {
f2fs_msg(sb, KERN_WARNING,
"Not support %d, larger than %d",
1 << arg, BIO_MAX_PAGES);
@@ -757,7 +757,7 @@ static int parse_options(struct super_block *sb, char *options)
kvfree(name);
break;
case Opt_test_dummy_encryption:
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (!f2fs_sb_has_encrypt(sbi)) {
f2fs_msg(sb, KERN_ERR, "Encrypt feature is off");
return -EINVAL;
@@ -821,6 +821,8 @@ static int parse_options(struct super_block *sb, char *options)
}
if (test_opt(sbi, INLINE_XATTR_SIZE)) {
+ int min_size, max_size;
+
if (!f2fs_sb_has_extra_attr(sbi) ||
!f2fs_sb_has_flexible_inline_xattr(sbi)) {
f2fs_msg(sb, KERN_ERR,
@@ -834,14 +836,15 @@ static int parse_options(struct super_block *sb, char *options)
"set with inline_xattr option");
return -EINVAL;
}
- if (!F2FS_OPTION(sbi).inline_xattr_size ||
- F2FS_OPTION(sbi).inline_xattr_size >=
- DEF_ADDRS_PER_INODE -
- F2FS_TOTAL_EXTRA_ATTR_SIZE -
- DEF_INLINE_RESERVED_SIZE -
- DEF_MIN_INLINE_SIZE) {
+
+ min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32);
+ max_size = MAX_INLINE_XATTR_SIZE;
+
+ if (F2FS_OPTION(sbi).inline_xattr_size < min_size ||
+ F2FS_OPTION(sbi).inline_xattr_size > max_size) {
f2fs_msg(sb, KERN_ERR,
- "inline xattr size is out of range");
+ "inline xattr size is out of range: %d ~ %d",
+ min_size, max_size);
return -EINVAL;
}
}
@@ -915,6 +918,10 @@ static int f2fs_drop_inode(struct inode *inode)
sb_start_intwrite(inode->i_sb);
f2fs_i_size_write(inode, 0);
+ f2fs_submit_merged_write_cond(F2FS_I_SB(inode),
+ inode, NULL, 0, DATA);
+ truncate_inode_pages_final(inode->i_mapping);
+
if (F2FS_HAS_BLOCKS(inode))
f2fs_truncate(inode);
@@ -1048,7 +1055,7 @@ static void f2fs_put_super(struct super_block *sb)
}
/* be sure to wait for any on-going discard commands */
- dropped = f2fs_wait_discard_bios(sbi);
+ dropped = f2fs_issue_discard_timeout(sbi);
if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) &&
!sbi->discard_blks && !dropped) {
@@ -1075,7 +1082,10 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_bug_on(sbi, sbi->fsync_node_num);
iput(sbi->node_inode);
+ sbi->node_inode = NULL;
+
iput(sbi->meta_inode);
+ sbi->meta_inode = NULL;
/*
* iput() can update stat information, if f2fs_write_checkpoint()
@@ -1390,7 +1400,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",whint_mode=%s", "user-based");
else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS)
seq_printf(seq, ",whint_mode=%s", "fs-based");
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (F2FS_OPTION(sbi).test_dummy_encryption)
seq_puts(seq, ",test_dummy_encryption");
#endif
@@ -1455,9 +1465,16 @@ static int f2fs_enable_quotas(struct super_block *sb);
static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
{
+ unsigned int s_flags = sbi->sb->s_flags;
struct cp_control cpc;
- int err;
+ int err = 0;
+ int ret;
+ if (s_flags & SB_RDONLY) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "checkpoint=disable on readonly fs");
+ return -EINVAL;
+ }
sbi->sb->s_flags |= SB_ACTIVE;
f2fs_update_time(sbi, DISABLE_TIME);
@@ -1465,18 +1482,24 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
while (!f2fs_time_over(sbi, DISABLE_TIME)) {
mutex_lock(&sbi->gc_mutex);
err = f2fs_gc(sbi, true, false, NULL_SEGNO);
- if (err == -ENODATA)
+ if (err == -ENODATA) {
+ err = 0;
break;
+ }
if (err && err != -EAGAIN)
- return err;
+ break;
}
- err = sync_filesystem(sbi->sb);
- if (err)
- return err;
+ ret = sync_filesystem(sbi->sb);
+ if (ret || err) {
+ err = ret ? ret: err;
+ goto restore_flag;
+ }
- if (f2fs_disable_cp_again(sbi))
- return -EAGAIN;
+ if (f2fs_disable_cp_again(sbi)) {
+ err = -EAGAIN;
+ goto restore_flag;
+ }
mutex_lock(&sbi->gc_mutex);
cpc.reason = CP_PAUSE;
@@ -1485,7 +1508,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
sbi->unusable_block_count = 0;
mutex_unlock(&sbi->gc_mutex);
- return 0;
+restore_flag:
+ sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+ return err;
}
static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
@@ -2023,6 +2048,12 @@ void f2fs_quota_off_umount(struct super_block *sb)
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
}
}
+ /*
+ * In case of checkpoint=disable, we must flush quota blocks.
+ * This can cause NULL exception for node_inode in end_io, since
+ * put_super already dropped it.
+ */
+ sync_filesystem(sb);
}
static void f2fs_truncate_quota_inode_pages(struct super_block *sb)
@@ -2154,7 +2185,7 @@ static const struct super_operations f2fs_sops = {
.remount_fs = f2fs_remount,
};
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
{
return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
@@ -2703,6 +2734,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL;
sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL;
sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL;
+ sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] =
+ DEF_UMOUNT_DISCARD_TIMEOUT;
clear_sbi_flag(sbi, SBI_NEED_FSCK);
for (i = 0; i < NR_COUNT_TYPE; i++)
@@ -3022,10 +3055,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
struct f2fs_super_block *raw_super;
struct inode *root;
int err;
- bool retry = true, need_fsck = false;
+ bool skip_recovery = false, need_fsck = false;
char *options = NULL;
int recovery, i, valid_super_block;
struct curseg_info *seg_i;
+ int retry_cnt = 1;
try_onemore:
err = -EINVAL;
@@ -3097,7 +3131,6 @@ try_onemore:
sb->s_maxbytes = sbi->max_file_blocks <<
le32_to_cpu(raw_super->log_blocksize);
sb->s_max_links = F2FS_LINK_MAX;
- get_random_bytes(&sbi->s_next_generation, sizeof(u32));
#ifdef CONFIG_QUOTA
sb->dq_op = &f2fs_quota_operations;
@@ -3116,7 +3149,7 @@ try_onemore:
#endif
sb->s_op = &f2fs_sops;
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
sb->s_cop = &f2fs_cryptops;
#endif
sb->s_xattr = f2fs_xattr_handlers;
@@ -3200,6 +3233,10 @@ try_onemore:
if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+ if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_DISABLED_QUICK_FLAG)) {
+ set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
+ sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_QUICK_INTERVAL;
+ }
/* Initialize device list */
err = f2fs_scan_devices(sbi);
@@ -3288,7 +3325,7 @@ try_onemore:
sb->s_root = d_make_root(root); /* allocate root dentry */
if (!sb->s_root) {
err = -ENOMEM;
- goto free_root_inode;
+ goto free_node_inode;
}
err = f2fs_register_sysfs(sbi);
@@ -3310,7 +3347,7 @@ try_onemore:
goto free_meta;
if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)))
- goto skip_recovery;
+ goto reset_checkpoint;
/* recover fsynced data */
if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
@@ -3327,11 +3364,13 @@ try_onemore:
if (need_fsck)
set_sbi_flag(sbi, SBI_NEED_FSCK);
- if (!retry)
- goto skip_recovery;
+ if (skip_recovery)
+ goto reset_checkpoint;
err = f2fs_recover_fsync_data(sbi, false);
if (err < 0) {
+ if (err != -ENOMEM)
+ skip_recovery = true;
need_fsck = true;
f2fs_msg(sb, KERN_ERR,
"Cannot recover all fsync data errno=%d", err);
@@ -3347,14 +3386,14 @@ try_onemore:
goto free_meta;
}
}
-skip_recovery:
+reset_checkpoint:
/* f2fs_recover_fsync_data() cleared this already */
clear_sbi_flag(sbi, SBI_POR_DOING);
if (test_opt(sbi, DISABLE_CHECKPOINT)) {
err = f2fs_disable_checkpoint(sbi);
if (err)
- goto free_meta;
+ goto sync_free_meta;
} else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) {
f2fs_enable_checkpoint(sbi);
}
@@ -3367,7 +3406,7 @@ skip_recovery:
/* After POR, we can run background GC thread.*/
err = f2fs_start_gc_thread(sbi);
if (err)
- goto free_meta;
+ goto sync_free_meta;
}
kvfree(options);
@@ -3387,8 +3426,14 @@ skip_recovery:
cur_cp_version(F2FS_CKPT(sbi)));
f2fs_update_time(sbi, CP_TIME);
f2fs_update_time(sbi, REQ_TIME);
+ clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
return 0;
+sync_free_meta:
+ /* safe to flush all the data */
+ sync_filesystem(sbi->sb);
+ retry_cnt = 0;
+
free_meta:
#ifdef CONFIG_QUOTA
f2fs_truncate_quota_inode_pages(sb);
@@ -3402,6 +3447,8 @@ free_meta:
* falls into an infinite loop in f2fs_sync_meta_pages().
*/
truncate_inode_pages_final(META_MAPPING(sbi));
+ /* evict some inodes being cached by GC */
+ evict_inodes(sb);
f2fs_unregister_sysfs(sbi);
free_root_inode:
dput(sb->s_root);
@@ -3410,6 +3457,7 @@ free_node_inode:
f2fs_release_ino_entry(sbi, true);
truncate_inode_pages_final(NODE_MAPPING(sbi));
iput(sbi->node_inode);
+ sbi->node_inode = NULL;
free_stats:
f2fs_destroy_stats(sbi);
free_nm:
@@ -3422,6 +3470,7 @@ free_devices:
free_meta_inode:
make_bad_inode(sbi->meta_inode);
iput(sbi->meta_inode);
+ sbi->meta_inode = NULL;
free_io_dummy:
mempool_destroy(sbi->write_io_dummy);
free_percpu:
@@ -3443,8 +3492,8 @@ free_sbi:
kvfree(sbi);
/* give only one another chance */
- if (retry) {
- retry = false;
+ if (retry_cnt > 0 && skip_recovery) {
+ retry_cnt--;
shrink_dcache_sb(sb);
goto try_onemore;
}
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 0575edbe3ed6..729f46a3c9ee 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -222,6 +222,8 @@ out:
#ifdef CONFIG_F2FS_FAULT_INJECTION
if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX))
return -EINVAL;
+ if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX)
+ return -EINVAL;
#endif
if (a->struct_type == RESERVED_BLOCKS) {
spin_lock(&sbi->stat_lock);
@@ -278,10 +280,16 @@ out:
return count;
}
- *ui = t;
- if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0)
- f2fs_reset_iostat(sbi);
+ if (!strcmp(a->attr.name, "iostat_enable")) {
+ sbi->iostat_enable = !!t;
+ if (!sbi->iostat_enable)
+ f2fs_reset_iostat(sbi);
+ return count;
+ }
+
+ *ui = (unsigned int)t;
+
return count;
}
@@ -418,6 +426,8 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval,
interval_time[DISCARD_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info,
+ umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold);
@@ -431,7 +441,7 @@ F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
F2FS_GENERAL_RO_ATTR(features);
F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO);
#endif
#ifdef CONFIG_BLK_DEV_ZONED
@@ -475,6 +485,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(idle_interval),
ATTR_LIST(discard_idle_interval),
ATTR_LIST(gc_idle_interval),
+ ATTR_LIST(umount_discard_timeout),
ATTR_LIST(iostat_enable),
ATTR_LIST(readdir_ra),
ATTR_LIST(gc_pin_file_thresh),
@@ -492,7 +503,7 @@ static struct attribute *f2fs_attrs[] = {
};
static struct attribute *f2fs_feat_attrs[] = {
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
#endif
#ifdef CONFIG_BLK_DEV_ZONED
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index ce2a5eb210b6..d0ab533a9ce8 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -14,7 +14,7 @@
#include "trace.h"
static RADIX_TREE(pids, GFP_ATOMIC);
-static struct mutex pids_lock;
+static spinlock_t pids_lock;
static struct last_io_info last_io;
static inline void __print_last_io(void)
@@ -58,23 +58,29 @@ void f2fs_trace_pid(struct page *page)
set_page_private(page, (unsigned long)pid);
+retry:
if (radix_tree_preload(GFP_NOFS))
return;
- mutex_lock(&pids_lock);
+ spin_lock(&pids_lock);
p = radix_tree_lookup(&pids, pid);
if (p == current)
goto out;
if (p)
radix_tree_delete(&pids, pid);
- f2fs_radix_tree_insert(&pids, pid, current);
+ if (radix_tree_insert(&pids, pid, current)) {
+ spin_unlock(&pids_lock);
+ radix_tree_preload_end();
+ cond_resched();
+ goto retry;
+ }
trace_printk("%3x:%3x %4x %-16s\n",
MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
pid, current->comm);
out:
- mutex_unlock(&pids_lock);
+ spin_unlock(&pids_lock);
radix_tree_preload_end();
}
@@ -119,7 +125,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
void f2fs_build_trace_ios(void)
{
- mutex_init(&pids_lock);
+ spin_lock_init(&pids_lock);
}
#define PIDVEC_SIZE 128
@@ -147,7 +153,7 @@ void f2fs_destroy_trace_ios(void)
pid_t next_pid = 0;
unsigned int found;
- mutex_lock(&pids_lock);
+ spin_lock(&pids_lock);
while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
unsigned idx;
@@ -155,5 +161,5 @@ void f2fs_destroy_trace_ios(void)
for (idx = 0; idx < found; idx++)
radix_tree_delete(&pids, pid[idx]);
}
- mutex_unlock(&pids_lock);
+ spin_unlock(&pids_lock);
}
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 18d5ffbc5e8c..848a785abe25 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -224,11 +224,11 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
{
struct f2fs_xattr_entry *entry;
unsigned int inline_size = inline_xattr_size(inode);
+ void *max_addr = base_addr + inline_size;
list_for_each_xattr(entry, base_addr) {
- if ((void *)entry + sizeof(__u32) > base_addr + inline_size ||
- (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) >
- base_addr + inline_size) {
+ if ((void *)entry + sizeof(__u32) > max_addr ||
+ (void *)XATTR_NEXT_ENTRY(entry) > max_addr) {
*last_addr = entry;
return NULL;
}
@@ -239,6 +239,13 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
if (!memcmp(entry->e_name, name, len))
break;
}
+
+ /* inline xattr header or entry across max inline xattr size */
+ if (IS_XATTR_LAST_ENTRY(entry) &&
+ (void *)entry + sizeof(__u32) > max_addr) {
+ *last_addr = entry;
+ return NULL;
+ }
return entry;
}
@@ -340,7 +347,7 @@ check:
*base_addr = txattr_addr;
return 0;
out:
- kzfree(txattr_addr);
+ kvfree(txattr_addr);
return err;
}
@@ -383,7 +390,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
*base_addr = txattr_addr;
return 0;
fail:
- kzfree(txattr_addr);
+ kvfree(txattr_addr);
return err;
}
@@ -510,7 +517,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
}
error = size;
out:
- kzfree(base_addr);
+ kvfree(base_addr);
return error;
}
@@ -538,7 +545,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if (!handler || (handler->list && !handler->list(dentry)))
continue;
- prefix = handler->prefix ?: handler->name;
+ prefix = xattr_prefix(handler);
prefix_len = strlen(prefix);
size = prefix_len + entry->e_name_len + 1;
if (buffer) {
@@ -556,7 +563,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
}
error = buffer_size - rest;
cleanup:
- kzfree(base_addr);
+ kvfree(base_addr);
return error;
}
@@ -687,7 +694,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (!error && S_ISDIR(inode->i_mode))
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
exit:
- kzfree(base_addr);
+ kvfree(base_addr);
return error;
}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 67db134da0f5..9172ee082ca8 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -78,6 +78,12 @@ struct f2fs_xattr_entry {
sizeof(struct f2fs_xattr_header) - \
sizeof(struct f2fs_xattr_entry))
+#define MAX_INLINE_XATTR_SIZE \
+ (DEF_ADDRS_PER_INODE - \
+ F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \
+ DEF_INLINE_RESERVED_SIZE - \
+ MIN_INLINE_DENTRY_SIZE / sizeof(__le32))
+
/*
* On-disk structure of f2fs_xattr
* We use inline xattrs space + 1 block for xattr.
diff --git a/fs/filesystems.c b/fs/filesystems.c
index b03f57b1105b..9135646e41ac 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/fs_parser.h>
/*
* Handling of filesystem drivers list.
@@ -73,6 +74,9 @@ int register_filesystem(struct file_system_type * fs)
int res = 0;
struct file_system_type ** p;
+ if (fs->parameters && !fs_validate_description(fs->parameters))
+ return -EINVAL;
+
BUG_ON(strchr(fs->name, '.'));
if (fs->next)
return -EBUSY;
diff --git a/fs/fs_context.c b/fs/fs_context.c
new file mode 100644
index 000000000000..87e3546b9a52
--- /dev/null
+++ b/fs/fs_context.c
@@ -0,0 +1,642 @@
+/* Provide a way to create a superblock configuration context within the kernel
+ * that allows a superblock to be set up prior to mounting.
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+#include <linux/magic.h>
+#include <linux/security.h>
+#include <linux/mnt_namespace.h>
+#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+#include <net/net_namespace.h>
+#include "mount.h"
+#include "internal.h"
+
+enum legacy_fs_param {
+ LEGACY_FS_UNSET_PARAMS,
+ LEGACY_FS_MONOLITHIC_PARAMS,
+ LEGACY_FS_INDIVIDUAL_PARAMS,
+};
+
+struct legacy_fs_context {
+ char *legacy_data; /* Data page for legacy filesystems */
+ size_t data_size;
+ enum legacy_fs_param param_type;
+};
+
+static int legacy_init_fs_context(struct fs_context *fc);
+
+static const struct constant_table common_set_sb_flag[] = {
+ { "dirsync", SB_DIRSYNC },
+ { "lazytime", SB_LAZYTIME },
+ { "mand", SB_MANDLOCK },
+ { "posixacl", SB_POSIXACL },
+ { "ro", SB_RDONLY },
+ { "sync", SB_SYNCHRONOUS },
+};
+
+static const struct constant_table common_clear_sb_flag[] = {
+ { "async", SB_SYNCHRONOUS },
+ { "nolazytime", SB_LAZYTIME },
+ { "nomand", SB_MANDLOCK },
+ { "rw", SB_RDONLY },
+ { "silent", SB_SILENT },
+};
+
+static const char *const forbidden_sb_flag[] = {
+ "bind",
+ "dev",
+ "exec",
+ "move",
+ "noatime",
+ "nodev",
+ "nodiratime",
+ "noexec",
+ "norelatime",
+ "nostrictatime",
+ "nosuid",
+ "private",
+ "rec",
+ "relatime",
+ "remount",
+ "shared",
+ "slave",
+ "strictatime",
+ "suid",
+ "unbindable",
+};
+
+/*
+ * Check for a common mount option that manipulates s_flags.
+ */
+static int vfs_parse_sb_flag(struct fs_context *fc, const char *key)
+{
+ unsigned int token;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(forbidden_sb_flag); i++)
+ if (strcmp(key, forbidden_sb_flag[i]) == 0)
+ return -EINVAL;
+
+ token = lookup_constant(common_set_sb_flag, key, 0);
+ if (token) {
+ fc->sb_flags |= token;
+ fc->sb_flags_mask |= token;
+ return 0;
+ }
+
+ token = lookup_constant(common_clear_sb_flag, key, 0);
+ if (token) {
+ fc->sb_flags &= ~token;
+ fc->sb_flags_mask |= token;
+ return 0;
+ }
+
+ return -ENOPARAM;
+}
+
+/**
+ * vfs_parse_fs_param - Add a single parameter to a superblock config
+ * @fc: The filesystem context to modify
+ * @param: The parameter
+ *
+ * A single mount option in string form is applied to the filesystem context
+ * being set up. Certain standard options (for example "ro") are translated
+ * into flag bits without going to the filesystem. The active security module
+ * is allowed to observe and poach options. Any other options are passed over
+ * to the filesystem to parse.
+ *
+ * This may be called multiple times for a context.
+ *
+ * Returns 0 on success and a negative error code on failure. In the event of
+ * failure, supplementary error information may have been set.
+ */
+int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ int ret;
+
+ if (!param->key)
+ return invalf(fc, "Unnamed parameter\n");
+
+ ret = vfs_parse_sb_flag(fc, param->key);
+ if (ret != -ENOPARAM)
+ return ret;
+
+ ret = security_fs_context_parse_param(fc, param);
+ if (ret != -ENOPARAM)
+ /* Param belongs to the LSM or is disallowed by the LSM; so
+ * don't pass to the FS.
+ */
+ return ret;
+
+ if (fc->ops->parse_param) {
+ ret = fc->ops->parse_param(fc, param);
+ if (ret != -ENOPARAM)
+ return ret;
+ }
+
+ /* If the filesystem doesn't take any arguments, give it the
+ * default handling of source.
+ */
+ if (strcmp(param->key, "source") == 0) {
+ if (param->type != fs_value_is_string)
+ return invalf(fc, "VFS: Non-string source");
+ if (fc->source)
+ return invalf(fc, "VFS: Multiple sources");
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
+ }
+
+ return invalf(fc, "%s: Unknown parameter '%s'",
+ fc->fs_type->name, param->key);
+}
+EXPORT_SYMBOL(vfs_parse_fs_param);
+
+/**
+ * vfs_parse_fs_string - Convenience function to just parse a string.
+ */
+int vfs_parse_fs_string(struct fs_context *fc, const char *key,
+ const char *value, size_t v_size)
+{
+ int ret;
+
+ struct fs_parameter param = {
+ .key = key,
+ .type = fs_value_is_string,
+ .size = v_size,
+ };
+
+ if (v_size > 0) {
+ param.string = kmemdup_nul(value, v_size, GFP_KERNEL);
+ if (!param.string)
+ return -ENOMEM;
+ }
+
+ ret = vfs_parse_fs_param(fc, &param);
+ kfree(param.string);
+ return ret;
+}
+EXPORT_SYMBOL(vfs_parse_fs_string);
+
+/**
+ * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
+ * @ctx: The superblock configuration to fill in.
+ * @data: The data to parse
+ *
+ * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be
+ * called from the ->monolithic_mount_data() fs_context operation.
+ *
+ * Returns 0 on success or the error returned by the ->parse_option() fs_context
+ * operation on failure.
+ */
+int generic_parse_monolithic(struct fs_context *fc, void *data)
+{
+ char *options = data, *key;
+ int ret = 0;
+
+ if (!options)
+ return 0;
+
+ ret = security_sb_eat_lsm_opts(options, &fc->security);
+ if (ret)
+ return ret;
+
+ while ((key = strsep(&options, ",")) != NULL) {
+ if (*key) {
+ size_t v_len = 0;
+ char *value = strchr(key, '=');
+
+ if (value) {
+ if (value == key)
+ continue;
+ *value++ = 0;
+ v_len = strlen(value);
+ }
+ ret = vfs_parse_fs_string(fc, key, value, v_len);
+ if (ret < 0)
+ break;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(generic_parse_monolithic);
+
+/**
+ * alloc_fs_context - Create a filesystem context.
+ * @fs_type: The filesystem type.
+ * @reference: The dentry from which this one derives (or NULL)
+ * @sb_flags: Filesystem/superblock flags (SB_*)
+ * @sb_flags_mask: Applicable members of @sb_flags
+ * @purpose: The purpose that this configuration shall be used for.
+ *
+ * Open a filesystem and create a mount context. The mount context is
+ * initialised with the supplied flags and, if a submount/automount from
+ * another superblock (referred to by @reference) is supplied, may have
+ * parameters such as namespaces copied across from that superblock.
+ */
+static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
+ struct dentry *reference,
+ unsigned int sb_flags,
+ unsigned int sb_flags_mask,
+ enum fs_context_purpose purpose)
+{
+ int (*init_fs_context)(struct fs_context *);
+ struct fs_context *fc;
+ int ret = -ENOMEM;
+
+ fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+ if (!fc)
+ return ERR_PTR(-ENOMEM);
+
+ fc->purpose = purpose;
+ fc->sb_flags = sb_flags;
+ fc->sb_flags_mask = sb_flags_mask;
+ fc->fs_type = get_filesystem(fs_type);
+ fc->cred = get_current_cred();
+ fc->net_ns = get_net(current->nsproxy->net_ns);
+
+ switch (purpose) {
+ case FS_CONTEXT_FOR_MOUNT:
+ fc->user_ns = get_user_ns(fc->cred->user_ns);
+ break;
+ case FS_CONTEXT_FOR_SUBMOUNT:
+ fc->user_ns = get_user_ns(reference->d_sb->s_user_ns);
+ break;
+ case FS_CONTEXT_FOR_RECONFIGURE:
+ /* We don't pin any namespaces as the superblock's
+ * subscriptions cannot be changed at this point.
+ */
+ atomic_inc(&reference->d_sb->s_active);
+ fc->root = dget(reference);
+ break;
+ }
+
+ /* TODO: Make all filesystems support this unconditionally */
+ init_fs_context = fc->fs_type->init_fs_context;
+ if (!init_fs_context)
+ init_fs_context = legacy_init_fs_context;
+
+ ret = init_fs_context(fc);
+ if (ret < 0)
+ goto err_fc;
+ fc->need_free = true;
+ return fc;
+
+err_fc:
+ put_fs_context(fc);
+ return ERR_PTR(ret);
+}
+
+struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
+ unsigned int sb_flags)
+{
+ return alloc_fs_context(fs_type, NULL, sb_flags, 0,
+ FS_CONTEXT_FOR_MOUNT);
+}
+EXPORT_SYMBOL(fs_context_for_mount);
+
+struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
+ unsigned int sb_flags,
+ unsigned int sb_flags_mask)
+{
+ return alloc_fs_context(dentry->d_sb->s_type, dentry, sb_flags,
+ sb_flags_mask, FS_CONTEXT_FOR_RECONFIGURE);
+}
+EXPORT_SYMBOL(fs_context_for_reconfigure);
+
+struct fs_context *fs_context_for_submount(struct file_system_type *type,
+ struct dentry *reference)
+{
+ return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT);
+}
+EXPORT_SYMBOL(fs_context_for_submount);
+
+void fc_drop_locked(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ dput(fc->root);
+ fc->root = NULL;
+ deactivate_locked_super(sb);
+}
+
+static void legacy_fs_context_free(struct fs_context *fc);
+
+/**
+ * vfs_dup_fc_config: Duplicate a filesystem context.
+ * @src_fc: The context to copy.
+ */
+struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
+{
+ struct fs_context *fc;
+ int ret;
+
+ if (!src_fc->ops->dup)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL);
+ if (!fc)
+ return ERR_PTR(-ENOMEM);
+
+ fc->fs_private = NULL;
+ fc->s_fs_info = NULL;
+ fc->source = NULL;
+ fc->security = NULL;
+ get_filesystem(fc->fs_type);
+ get_net(fc->net_ns);
+ get_user_ns(fc->user_ns);
+ get_cred(fc->cred);
+
+ /* Can't call put until we've called ->dup */
+ ret = fc->ops->dup(fc, src_fc);
+ if (ret < 0)
+ goto err_fc;
+
+ ret = security_fs_context_dup(fc, src_fc);
+ if (ret < 0)
+ goto err_fc;
+ return fc;
+
+err_fc:
+ put_fs_context(fc);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(vfs_dup_fs_context);
+
+#ifdef CONFIG_PRINTK
+/**
+ * logfc - Log a message to a filesystem context
+ * @fc: The filesystem context to log to.
+ * @fmt: The format of the buffer.
+ */
+void logfc(struct fs_context *fc, const char *fmt, ...)
+{
+ va_list va;
+
+ va_start(va, fmt);
+
+ switch (fmt[0]) {
+ case 'w':
+ vprintk_emit(0, LOGLEVEL_WARNING, NULL, 0, fmt, va);
+ break;
+ case 'e':
+ vprintk_emit(0, LOGLEVEL_ERR, NULL, 0, fmt, va);
+ break;
+ default:
+ vprintk_emit(0, LOGLEVEL_NOTICE, NULL, 0, fmt, va);
+ break;
+ }
+
+ pr_cont("\n");
+ va_end(va);
+}
+EXPORT_SYMBOL(logfc);
+#endif
+
+/**
+ * put_fs_context - Dispose of a superblock configuration context.
+ * @fc: The context to dispose of.
+ */
+void put_fs_context(struct fs_context *fc)
+{
+ struct super_block *sb;
+
+ if (fc->root) {
+ sb = fc->root->d_sb;
+ dput(fc->root);
+ fc->root = NULL;
+ deactivate_super(sb);
+ }
+
+ if (fc->need_free && fc->ops && fc->ops->free)
+ fc->ops->free(fc);
+
+ security_free_mnt_opts(&fc->security);
+ put_net(fc->net_ns);
+ put_user_ns(fc->user_ns);
+ put_cred(fc->cred);
+ kfree(fc->subtype);
+ put_filesystem(fc->fs_type);
+ kfree(fc->source);
+ kfree(fc);
+}
+EXPORT_SYMBOL(put_fs_context);
+
+/*
+ * Free the config for a filesystem that doesn't support fs_context.
+ */
+static void legacy_fs_context_free(struct fs_context *fc)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+
+ if (ctx) {
+ if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS)
+ kfree(ctx->legacy_data);
+ kfree(ctx);
+ }
+}
+
+/*
+ * Duplicate a legacy config.
+ */
+static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
+{
+ struct legacy_fs_context *ctx;
+ struct legacy_fs_context *src_ctx = src_fc->fs_private;
+
+ ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) {
+ ctx->legacy_data = kmemdup(src_ctx->legacy_data,
+ src_ctx->data_size, GFP_KERNEL);
+ if (!ctx->legacy_data) {
+ kfree(ctx);
+ return -ENOMEM;
+ }
+ }
+
+ fc->fs_private = ctx;
+ return 0;
+}
+
+/*
+ * Add a parameter to a legacy config. We build up a comma-separated list of
+ * options.
+ */
+static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+ unsigned int size = ctx->data_size;
+ size_t len = 0;
+
+ if (strcmp(param->key, "source") == 0) {
+ if (param->type != fs_value_is_string)
+ return invalf(fc, "VFS: Legacy: Non-string source");
+ if (fc->source)
+ return invalf(fc, "VFS: Legacy: Multiple sources");
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
+ }
+
+ if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) &&
+ strcmp(param->key, "subtype") == 0) {
+ if (param->type != fs_value_is_string)
+ return invalf(fc, "VFS: Legacy: Non-string subtype");
+ if (fc->subtype)
+ return invalf(fc, "VFS: Legacy: Multiple subtype");
+ fc->subtype = param->string;
+ param->string = NULL;
+ return 0;
+ }
+
+ if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
+ return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");
+
+ switch (param->type) {
+ case fs_value_is_string:
+ len = 1 + param->size;
+ /* Fall through */
+ case fs_value_is_flag:
+ len += strlen(param->key);
+ break;
+ default:
+ return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported",
+ param->key);
+ }
+
+ if (len > PAGE_SIZE - 2 - size)
+ return invalf(fc, "VFS: Legacy: Cumulative options too large");
+ if (strchr(param->key, ',') ||
+ (param->type == fs_value_is_string &&
+ memchr(param->string, ',', param->size)))
+ return invalf(fc, "VFS: Legacy: Option '%s' contained comma",
+ param->key);
+ if (!ctx->legacy_data) {
+ ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!ctx->legacy_data)
+ return -ENOMEM;
+ }
+
+ ctx->legacy_data[size++] = ',';
+ len = strlen(param->key);
+ memcpy(ctx->legacy_data + size, param->key, len);
+ size += len;
+ if (param->type == fs_value_is_string) {
+ ctx->legacy_data[size++] = '=';
+ memcpy(ctx->legacy_data + size, param->string, param->size);
+ size += param->size;
+ }
+ ctx->legacy_data[size] = '\0';
+ ctx->data_size = size;
+ ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS;
+ return 0;
+}
+
+/*
+ * Add monolithic mount data.
+ */
+static int legacy_parse_monolithic(struct fs_context *fc, void *data)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+
+ if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) {
+ pr_warn("VFS: Can't mix monolithic and individual options\n");
+ return -EINVAL;
+ }
+
+ ctx->legacy_data = data;
+ ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS;
+ if (!ctx->legacy_data)
+ return 0;
+
+ if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA)
+ return 0;
+ return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security);
+}
+
+/*
+ * Get a mountable root with the legacy mount command.
+ */
+static int legacy_get_tree(struct fs_context *fc)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+ struct super_block *sb;
+ struct dentry *root;
+
+ root = fc->fs_type->mount(fc->fs_type, fc->sb_flags,
+ fc->source, ctx->legacy_data);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+
+ sb = root->d_sb;
+ BUG_ON(!sb);
+
+ fc->root = root;
+ return 0;
+}
+
+/*
+ * Handle remount.
+ */
+static int legacy_reconfigure(struct fs_context *fc)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
+
+ if (!sb->s_op->remount_fs)
+ return 0;
+
+ return sb->s_op->remount_fs(sb, &fc->sb_flags,
+ ctx ? ctx->legacy_data : NULL);
+}
+
+const struct fs_context_operations legacy_fs_context_ops = {
+ .free = legacy_fs_context_free,
+ .dup = legacy_fs_context_dup,
+ .parse_param = legacy_parse_param,
+ .parse_monolithic = legacy_parse_monolithic,
+ .get_tree = legacy_get_tree,
+ .reconfigure = legacy_reconfigure,
+};
+
+/*
+ * Initialise a legacy context for a filesystem that doesn't support
+ * fs_context.
+ */
+static int legacy_init_fs_context(struct fs_context *fc)
+{
+ fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL);
+ if (!fc->fs_private)
+ return -ENOMEM;
+ fc->ops = &legacy_fs_context_ops;
+ return 0;
+}
+
+int parse_monolithic_mount_data(struct fs_context *fc, void *data)
+{
+ int (*monolithic_mount_data)(struct fs_context *, void *);
+
+ monolithic_mount_data = fc->ops->parse_monolithic;
+ if (!monolithic_mount_data)
+ monolithic_mount_data = generic_parse_monolithic;
+
+ return monolithic_mount_data(fc, data);
+}
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
new file mode 100644
index 000000000000..570d71043acf
--- /dev/null
+++ b/fs/fs_parser.c
@@ -0,0 +1,447 @@
+/* Filesystem parameter parser.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <linux/namei.h>
+#include "internal.h"
+
+static const struct constant_table bool_names[] = {
+ { "0", false },
+ { "1", true },
+ { "false", false },
+ { "no", false },
+ { "true", true },
+ { "yes", true },
+};
+
+/**
+ * lookup_constant - Look up a constant by name in an ordered table
+ * @tbl: The table of constants to search.
+ * @tbl_size: The size of the table.
+ * @name: The name to look up.
+ * @not_found: The value to return if the name is not found.
+ */
+int __lookup_constant(const struct constant_table *tbl, size_t tbl_size,
+ const char *name, int not_found)
+{
+ unsigned int i;
+
+ for (i = 0; i < tbl_size; i++)
+ if (strcmp(name, tbl[i].name) == 0)
+ return tbl[i].value;
+
+ return not_found;
+}
+EXPORT_SYMBOL(__lookup_constant);
+
+static const struct fs_parameter_spec *fs_lookup_key(
+ const struct fs_parameter_description *desc,
+ const char *name)
+{
+ const struct fs_parameter_spec *p;
+
+ if (!desc->specs)
+ return NULL;
+
+ for (p = desc->specs; p->name; p++)
+ if (strcmp(p->name, name) == 0)
+ return p;
+
+ return NULL;
+}
+
+/*
+ * fs_parse - Parse a filesystem configuration parameter
+ * @fc: The filesystem context to log errors through.
+ * @desc: The parameter description to use.
+ * @param: The parameter.
+ * @result: Where to place the result of the parse
+ *
+ * Parse a filesystem configuration parameter and attempt a conversion for a
+ * simple parameter for which this is requested. If successful, the determined
+ * parameter ID is placed into @result->key, the desired type is indicated in
+ * @result->t and any converted value is placed into an appropriate member of
+ * the union in @result.
+ *
+ * The function returns the parameter number if the parameter was matched,
+ * -ENOPARAM if it wasn't matched and @desc->ignore_unknown indicated that
+ * unknown parameters are okay and -EINVAL if there was a conversion issue or
+ * the parameter wasn't recognised and unknowns aren't okay.
+ */
+int fs_parse(struct fs_context *fc,
+ const struct fs_parameter_description *desc,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
+{
+ const struct fs_parameter_spec *p;
+ const struct fs_parameter_enum *e;
+ int ret = -ENOPARAM, b;
+
+ result->has_value = !!param->string;
+ result->negated = false;
+ result->uint_64 = 0;
+
+ p = fs_lookup_key(desc, param->key);
+ if (!p) {
+ /* If we didn't find something that looks like "noxxx", see if
+ * "xxx" takes the "no"-form negative - but only if there
+ * wasn't an value.
+ */
+ if (result->has_value)
+ goto unknown_parameter;
+ if (param->key[0] != 'n' || param->key[1] != 'o' || !param->key[2])
+ goto unknown_parameter;
+
+ p = fs_lookup_key(desc, param->key + 2);
+ if (!p)
+ goto unknown_parameter;
+ if (!(p->flags & fs_param_neg_with_no))
+ goto unknown_parameter;
+ result->boolean = false;
+ result->negated = true;
+ }
+
+ if (p->flags & fs_param_deprecated)
+ warnf(fc, "%s: Deprecated parameter '%s'",
+ desc->name, param->key);
+
+ if (result->negated)
+ goto okay;
+
+ /* Certain parameter types only take a string and convert it. */
+ switch (p->type) {
+ case __fs_param_wasnt_defined:
+ return -EINVAL;
+ case fs_param_is_u32:
+ case fs_param_is_u32_octal:
+ case fs_param_is_u32_hex:
+ case fs_param_is_s32:
+ case fs_param_is_u64:
+ case fs_param_is_enum:
+ case fs_param_is_string:
+ if (param->type != fs_value_is_string)
+ goto bad_value;
+ if (!result->has_value) {
+ if (p->flags & fs_param_v_optional)
+ goto okay;
+ goto bad_value;
+ }
+ /* Fall through */
+ default:
+ break;
+ }
+
+ /* Try to turn the type we were given into the type desired by the
+ * parameter and give an error if we can't.
+ */
+ switch (p->type) {
+ case fs_param_is_flag:
+ if (param->type != fs_value_is_flag &&
+ (param->type != fs_value_is_string || result->has_value))
+ return invalf(fc, "%s: Unexpected value for '%s'",
+ desc->name, param->key);
+ result->boolean = true;
+ goto okay;
+
+ case fs_param_is_bool:
+ switch (param->type) {
+ case fs_value_is_flag:
+ result->boolean = true;
+ goto okay;
+ case fs_value_is_string:
+ if (param->size == 0) {
+ result->boolean = true;
+ goto okay;
+ }
+ b = lookup_constant(bool_names, param->string, -1);
+ if (b == -1)
+ goto bad_value;
+ result->boolean = b;
+ goto okay;
+ default:
+ goto bad_value;
+ }
+
+ case fs_param_is_u32:
+ ret = kstrtouint(param->string, 0, &result->uint_32);
+ goto maybe_okay;
+ case fs_param_is_u32_octal:
+ ret = kstrtouint(param->string, 8, &result->uint_32);
+ goto maybe_okay;
+ case fs_param_is_u32_hex:
+ ret = kstrtouint(param->string, 16, &result->uint_32);
+ goto maybe_okay;
+ case fs_param_is_s32:
+ ret = kstrtoint(param->string, 0, &result->int_32);
+ goto maybe_okay;
+ case fs_param_is_u64:
+ ret = kstrtoull(param->string, 0, &result->uint_64);
+ goto maybe_okay;
+
+ case fs_param_is_enum:
+ for (e = desc->enums; e->name[0]; e++) {
+ if (e->opt == p->opt &&
+ strcmp(e->name, param->string) == 0) {
+ result->uint_32 = e->value;
+ goto okay;
+ }
+ }
+ goto bad_value;
+
+ case fs_param_is_string:
+ goto okay;
+ case fs_param_is_blob:
+ if (param->type != fs_value_is_blob)
+ goto bad_value;
+ goto okay;
+
+ case fs_param_is_fd: {
+ if (param->type != fs_value_is_file)
+ goto bad_value;
+ goto okay;
+ }
+
+ case fs_param_is_blockdev:
+ case fs_param_is_path:
+ goto okay;
+ default:
+ BUG();
+ }
+
+maybe_okay:
+ if (ret < 0)
+ goto bad_value;
+okay:
+ return p->opt;
+
+bad_value:
+ return invalf(fc, "%s: Bad value for '%s'", desc->name, param->key);
+unknown_parameter:
+ return -ENOPARAM;
+}
+EXPORT_SYMBOL(fs_parse);
+
+/**
+ * fs_lookup_param - Look up a path referred to by a parameter
+ * @fc: The filesystem context to log errors through.
+ * @param: The parameter.
+ * @want_bdev: T if want a blockdev
+ * @_path: The result of the lookup
+ */
+int fs_lookup_param(struct fs_context *fc,
+ struct fs_parameter *param,
+ bool want_bdev,
+ struct path *_path)
+{
+ struct filename *f;
+ unsigned int flags = 0;
+ bool put_f;
+ int ret;
+
+ switch (param->type) {
+ case fs_value_is_string:
+ f = getname_kernel(param->string);
+ if (IS_ERR(f))
+ return PTR_ERR(f);
+ put_f = true;
+ break;
+ case fs_value_is_filename_empty:
+ flags = LOOKUP_EMPTY;
+ /* Fall through */
+ case fs_value_is_filename:
+ f = param->name;
+ put_f = false;
+ break;
+ default:
+ return invalf(fc, "%s: not usable as path", param->key);
+ }
+
+ ret = filename_lookup(param->dirfd, f, flags, _path, NULL);
+ if (ret < 0) {
+ errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name);
+ goto out;
+ }
+
+ if (want_bdev &&
+ !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) {
+ path_put(_path);
+ _path->dentry = NULL;
+ _path->mnt = NULL;
+ errorf(fc, "%s: Non-blockdev passed as '%s'",
+ param->key, f->name);
+ ret = -ENOTBLK;
+ }
+
+out:
+ if (put_f)
+ putname(f);
+ return ret;
+}
+EXPORT_SYMBOL(fs_lookup_param);
+
+#ifdef CONFIG_VALIDATE_FS_PARSER
+/**
+ * validate_constant_table - Validate a constant table
+ * @name: Name to use in reporting
+ * @tbl: The constant table to validate.
+ * @tbl_size: The size of the table.
+ * @low: The lowest permissible value.
+ * @high: The highest permissible value.
+ * @special: One special permissible value outside of the range.
+ */
+bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
+ int low, int high, int special)
+{
+ size_t i;
+ bool good = true;
+
+ if (tbl_size == 0) {
+ pr_warn("VALIDATE C-TBL: Empty\n");
+ return true;
+ }
+
+ for (i = 0; i < tbl_size; i++) {
+ if (!tbl[i].name) {
+ pr_err("VALIDATE C-TBL[%zu]: Null\n", i);
+ good = false;
+ } else if (i > 0 && tbl[i - 1].name) {
+ int c = strcmp(tbl[i-1].name, tbl[i].name);
+
+ if (c == 0) {
+ pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n",
+ i, tbl[i].name);
+ good = false;
+ }
+ if (c > 0) {
+ pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n",
+ i, tbl[i-1].name, tbl[i].name);
+ good = false;
+ }
+ }
+
+ if (tbl[i].value != special &&
+ (tbl[i].value < low || tbl[i].value > high)) {
+ pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n",
+ i, tbl[i].name, tbl[i].value, low, high);
+ good = false;
+ }
+ }
+
+ return good;
+}
+
+/**
+ * fs_validate_description - Validate a parameter description
+ * @desc: The parameter description to validate.
+ */
+bool fs_validate_description(const struct fs_parameter_description *desc)
+{
+ const struct fs_parameter_spec *param, *p2;
+ const struct fs_parameter_enum *e;
+ const char *name = desc->name;
+ unsigned int nr_params = 0;
+ bool good = true, enums = false;
+
+ pr_notice("*** VALIDATE %s ***\n", name);
+
+ if (!name[0]) {
+ pr_err("VALIDATE Parser: No name\n");
+ name = "Unknown";
+ good = false;
+ }
+
+ if (desc->specs) {
+ for (param = desc->specs; param->name; param++) {
+ enum fs_parameter_type t = param->type;
+
+ /* Check that the type is in range */
+ if (t == __fs_param_wasnt_defined ||
+ t >= nr__fs_parameter_type) {
+ pr_err("VALIDATE %s: PARAM[%s] Bad type %u\n",
+ name, param->name, t);
+ good = false;
+ } else if (t == fs_param_is_enum) {
+ enums = true;
+ }
+
+ /* Check for duplicate parameter names */
+ for (p2 = desc->specs; p2 < param; p2++) {
+ if (strcmp(param->name, p2->name) == 0) {
+ pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n",
+ name, param->name);
+ good = false;
+ }
+ }
+ }
+
+ nr_params = param - desc->specs;
+ }
+
+ if (desc->enums) {
+ if (!nr_params) {
+ pr_err("VALIDATE %s: Enum table but no parameters\n",
+ name);
+ good = false;
+ goto no_enums;
+ }
+ if (!enums) {
+ pr_err("VALIDATE %s: Enum table but no enum-type values\n",
+ name);
+ good = false;
+ goto no_enums;
+ }
+
+ for (e = desc->enums; e->name[0]; e++) {
+ /* Check that all entries in the enum table have at
+ * least one parameter that uses them.
+ */
+ for (param = desc->specs; param->name; param++) {
+ if (param->opt == e->opt &&
+ param->type != fs_param_is_enum) {
+ pr_err("VALIDATE %s: e[%tu] enum val for %s\n",
+ name, e - desc->enums, param->name);
+ good = false;
+ }
+ }
+ }
+
+ /* Check that all enum-type parameters have at least one enum
+ * value in the enum table.
+ */
+ for (param = desc->specs; param->name; param++) {
+ if (param->type != fs_param_is_enum)
+ continue;
+ for (e = desc->enums; e->name[0]; e++)
+ if (e->opt == param->opt)
+ break;
+ if (!e->name[0]) {
+ pr_err("VALIDATE %s: PARAM[%s] enum with no values\n",
+ name, param->name);
+ good = false;
+ }
+ }
+ } else {
+ if (enums) {
+ pr_err("VALIDATE %s: enum-type values, but no enum table\n",
+ name);
+ good = false;
+ goto no_enums;
+ }
+ }
+
+no_enums:
+ return good;
+}
+#endif /* CONFIG_VALIDATE_FS_PARSER */
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 989df5accaee..fe80bea4ad89 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -35,7 +35,9 @@ static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf,
{
struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
if (fc) {
- fuse_abort_conn(fc, true);
+ if (fc->abort_err)
+ fc->aborted = true;
+ fuse_abort_conn(fc);
fuse_conn_put(fc);
}
return count;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 8f68181256c0..55a26f351467 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -141,10 +141,11 @@ static int cuse_open(struct inode *inode, struct file *file)
static int cuse_release(struct inode *inode, struct file *file)
{
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
- fuse_sync_release(ff, file->f_flags);
+ fuse_sync_release(fi, ff, file->f_flags);
fuse_conn_put(fc);
return 0;
@@ -407,7 +408,7 @@ err_unlock:
err_region:
unregister_chrdev_region(devt, 1);
err:
- fuse_abort_conn(fc, false);
+ fuse_abort_conn(fc);
goto out;
}
@@ -586,7 +587,7 @@ static ssize_t cuse_class_abort_store(struct device *dev,
{
struct cuse_conn *cc = dev_get_drvdata(dev);
- fuse_abort_conn(&cc->fc, false);
+ fuse_abort_conn(&cc->fc);
return count;
}
static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 809c0f2f9942..9971a35cf1ef 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -251,17 +251,18 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
struct file *file)
{
struct fuse_req *req = NULL;
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
struct fuse_file *ff = file->private_data;
do {
wait_event(fc->reserved_req_waitq, ff->reserved_req);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if (ff->reserved_req) {
req = ff->reserved_req;
ff->reserved_req = NULL;
req->stolen_file = get_file(file);
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
} while (!req);
return req;
@@ -273,16 +274,17 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
{
struct file *file = req->stolen_file;
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
struct fuse_file *ff = file->private_data;
WARN_ON(req->max_pages);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
memset(req, 0, sizeof(*req));
fuse_request_init(req, NULL, NULL, 0);
BUG_ON(ff->reserved_req);
ff->reserved_req = req;
wake_up_all(&fc->reserved_req_waitq);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fput(file);
}
@@ -431,10 +433,16 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
if (test_and_set_bit(FR_FINISHED, &req->flags))
goto put_request;
-
- spin_lock(&fiq->waitq.lock);
- list_del_init(&req->intr_entry);
- spin_unlock(&fiq->waitq.lock);
+ /*
+ * test_and_set_bit() implies smp_mb() between bit
+ * changing and below intr_entry check. Pairs with
+ * smp_mb() from queue_interrupt().
+ */
+ if (!list_empty(&req->intr_entry)) {
+ spin_lock(&fiq->waitq.lock);
+ list_del_init(&req->intr_entry);
+ spin_unlock(&fiq->waitq.lock);
+ }
WARN_ON(test_bit(FR_PENDING, &req->flags));
WARN_ON(test_bit(FR_SENT, &req->flags));
if (test_bit(FR_BACKGROUND, &req->flags)) {
@@ -462,27 +470,43 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
fc->active_background--;
flush_bg_queue(fc);
spin_unlock(&fc->bg_lock);
+ } else {
+ /* Wake up waiter sleeping in request_wait_answer() */
+ wake_up(&req->waitq);
}
- wake_up(&req->waitq);
+
if (req->end)
req->end(fc, req);
put_request:
fuse_put_request(fc, req);
}
-static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
{
spin_lock(&fiq->waitq.lock);
- if (test_bit(FR_FINISHED, &req->flags)) {
+ /* Check for we've sent request to interrupt this req */
+ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
spin_unlock(&fiq->waitq.lock);
- return;
+ return -EINVAL;
}
+
if (list_empty(&req->intr_entry)) {
list_add_tail(&req->intr_entry, &fiq->interrupts);
+ /*
+ * Pairs with smp_mb() implied by test_and_set_bit()
+ * from request_end().
+ */
+ smp_mb();
+ if (test_bit(FR_FINISHED, &req->flags)) {
+ list_del_init(&req->intr_entry);
+ spin_unlock(&fiq->waitq.lock);
+ return 0;
+ }
wake_up_locked(&fiq->waitq);
+ kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
}
spin_unlock(&fiq->waitq.lock);
- kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+ return 0;
}
static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
@@ -1306,7 +1330,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
goto err_unlock;
if (!fiq->connected) {
- err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV;
+ err = fc->aborted ? -ECONNABORTED : -ENODEV;
goto err_unlock;
}
@@ -1353,7 +1377,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
spin_lock(&fpq->lock);
clear_bit(FR_LOCKED, &req->flags);
if (!fpq->connected) {
- err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV;
+ err = fc->aborted ? -ECONNABORTED : -ENODEV;
goto out_end;
}
if (err) {
@@ -1900,16 +1924,17 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
struct fuse_req *req;
struct fuse_out_header oh;
+ err = -EINVAL;
if (nbytes < sizeof(struct fuse_out_header))
- return -EINVAL;
+ goto out;
err = fuse_copy_one(cs, &oh, sizeof(oh));
if (err)
- goto err_finish;
+ goto copy_finish;
err = -EINVAL;
if (oh.len != nbytes)
- goto err_finish;
+ goto copy_finish;
/*
* Zero oh.unique indicates unsolicited notification message
@@ -1917,41 +1942,40 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
*/
if (!oh.unique) {
err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
- return err ? err : nbytes;
+ goto out;
}
err = -EINVAL;
if (oh.error <= -1000 || oh.error > 0)
- goto err_finish;
+ goto copy_finish;
spin_lock(&fpq->lock);
- err = -ENOENT;
- if (!fpq->connected)
- goto err_unlock_pq;
+ req = NULL;
+ if (fpq->connected)
+ req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
- req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
- if (!req)
- goto err_unlock_pq;
+ err = -ENOENT;
+ if (!req) {
+ spin_unlock(&fpq->lock);
+ goto copy_finish;
+ }
/* Is it an interrupt reply ID? */
if (oh.unique & FUSE_INT_REQ_BIT) {
__fuse_get_request(req);
spin_unlock(&fpq->lock);
- err = -EINVAL;
- if (nbytes != sizeof(struct fuse_out_header)) {
- fuse_put_request(fc, req);
- goto err_finish;
- }
-
- if (oh.error == -ENOSYS)
+ err = 0;
+ if (nbytes != sizeof(struct fuse_out_header))
+ err = -EINVAL;
+ else if (oh.error == -ENOSYS)
fc->no_interrupt = 1;
else if (oh.error == -EAGAIN)
- queue_interrupt(&fc->iq, req);
+ err = queue_interrupt(&fc->iq, req);
+
fuse_put_request(fc, req);
- fuse_copy_finish(cs);
- return nbytes;
+ goto copy_finish;
}
clear_bit(FR_SENT, &req->flags);
@@ -1977,14 +2001,12 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
spin_unlock(&fpq->lock);
request_end(fc, req);
-
+out:
return err ? err : nbytes;
- err_unlock_pq:
- spin_unlock(&fpq->lock);
- err_finish:
+copy_finish:
fuse_copy_finish(cs);
- return err;
+ goto out;
}
static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
@@ -2034,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
ret = -EINVAL;
- if (rem < len) {
- pipe_unlock(pipe);
- goto out;
- }
+ if (rem < len)
+ goto out_free;
rem = len;
while (rem) {
@@ -2055,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
pipe->nrbufs--;
} else {
- pipe_buf_get(pipe, ibuf);
+ if (!pipe_buf_get(pipe, ibuf))
+ goto out_free;
+
*obuf = *ibuf;
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
obuf->len = rem;
@@ -2078,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
ret = fuse_dev_do_write(fud, &cs, len);
pipe_lock(pipe);
+out_free:
for (idx = 0; idx < nbuf; idx++)
pipe_buf_release(pipe, &bufs[idx]);
pipe_unlock(pipe);
-out:
kvfree(bufs);
return ret;
}
@@ -2109,11 +2131,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
return mask;
}
-/*
- * Abort all requests on the given list (pending or processing)
- *
- * This function releases and reacquires fc->lock
- */
+/* Abort all requests on the given list (pending or processing) */
static void end_requests(struct fuse_conn *fc, struct list_head *head)
{
while (!list_empty(head)) {
@@ -2159,7 +2177,7 @@ static void end_polls(struct fuse_conn *fc)
* is OK, the request will in that case be removed from the list before we touch
* it.
*/
-void fuse_abort_conn(struct fuse_conn *fc, bool is_abort)
+void fuse_abort_conn(struct fuse_conn *fc)
{
struct fuse_iqueue *fiq = &fc->iq;
@@ -2175,7 +2193,6 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort)
fc->connected = 0;
spin_unlock(&fc->bg_lock);
- fc->aborted = is_abort;
fuse_set_initialized(fc);
list_for_each_entry(fud, &fc->devices, entry) {
struct fuse_pqueue *fpq = &fud->pq;
@@ -2253,7 +2270,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
/* Are we the last open device? */
if (atomic_dec_and_test(&fc->dev_count)) {
WARN_ON(fc->iq.fasync != NULL);
- fuse_abort_conn(fc, false);
+ fuse_abort_conn(fc);
}
fuse_dev_free(fud);
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index e909678afa2d..dd0f64f7bc06 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -149,21 +149,6 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
args->out.args[0].value = outarg;
}
-u64 fuse_get_attr_version(struct fuse_conn *fc)
-{
- u64 curr_version;
-
- /*
- * The spin lock isn't actually needed on 64bit archs, but we
- * don't yet care too much about such optimizations.
- */
- spin_lock(&fc->lock);
- curr_version = fc->attr_version;
- spin_unlock(&fc->lock);
-
- return curr_version;
-}
-
/*
* Check whether the dentry is still valid
*
@@ -222,9 +207,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
fuse_queue_forget(fc, forget, outarg.nodeid, 1);
goto invalid;
}
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
fi->nlookup++;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
}
kfree(forget);
if (ret == -ENOMEM)
@@ -400,6 +385,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
struct fuse_create_in inarg;
struct fuse_open_out outopen;
struct fuse_entry_out outentry;
+ struct fuse_inode *fi;
struct fuse_file *ff;
/* Userspace expects S_IFREG in create mode */
@@ -451,7 +437,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
&outentry.attr, entry_attr_timeout(&outentry), 0);
if (!inode) {
flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
- fuse_sync_release(ff, flags);
+ fuse_sync_release(NULL, ff, flags);
fuse_queue_forget(fc, forget, outentry.nodeid, 1);
err = -ENOMEM;
goto out_err;
@@ -462,7 +448,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
fuse_dir_changed(dir);
err = finish_open(file, entry, generic_file_open);
if (err) {
- fuse_sync_release(ff, flags);
+ fi = get_fuse_inode(inode);
+ fuse_sync_release(fi, ff, flags);
} else {
file->private_data = ff;
fuse_finish_open(inode, file);
@@ -671,8 +658,8 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
struct inode *inode = d_inode(entry);
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
/*
* If i_nlink == 0 then unlink doesn't make sense, yet this can
* happen if userspace filesystem is careless. It would be
@@ -681,7 +668,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
*/
if (inode->i_nlink > 0)
drop_nlink(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_invalidate_attr(inode);
fuse_dir_changed(dir);
fuse_invalidate_entry_cache(entry);
@@ -825,10 +812,10 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
if (!err) {
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
inc_nlink(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_invalidate_attr(inode);
fuse_update_ctime(inode);
} else if (err == -EINTR) {
@@ -1356,15 +1343,14 @@ static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
*/
void fuse_set_nowrite(struct inode *inode)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
BUG_ON(!inode_is_locked(inode));
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
BUG_ON(fi->writectr < 0);
fi->writectr += FUSE_NOWRITE;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);
}
@@ -1385,11 +1371,11 @@ static void __fuse_release_nowrite(struct inode *inode)
void fuse_release_nowrite(struct inode *inode)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
__fuse_release_nowrite(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
}
static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
@@ -1524,7 +1510,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
goto error;
}
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
/* the kernel maintains i_mtime locally */
if (trust_local_cmtime) {
if (attr->ia_valid & ATTR_MTIME)
@@ -1542,10 +1528,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
i_size_write(inode, outarg.attr.size);
if (is_truncate) {
- /* NOTE: this may release/reacquire fc->lock */
+ /* NOTE: this may release/reacquire fi->lock */
__fuse_release_nowrite(inode);
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
/*
* Only call invalidate_inode_pages2() after removing
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a59c16bd90ac..06096b60f1df 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -19,8 +19,6 @@
#include <linux/falloc.h>
#include <linux/uio.h>
-static const struct file_operations fuse_direct_io_file_operations;
-
static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
int opcode, struct fuse_open_out *outargp)
{
@@ -64,9 +62,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
- spin_lock(&fc->lock);
- ff->kh = ++fc->khctr;
- spin_unlock(&fc->lock);
+ ff->kh = atomic64_inc_return(&fc->khctr);
return ff;
}
@@ -94,7 +90,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
if (refcount_dec_and_test(&ff->count)) {
struct fuse_req *req = ff->reserved_req;
- if (ff->fc->no_open && !isdir) {
+ if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
/*
* Drop the release request when client does not
* implement 'open'
@@ -128,8 +124,9 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
return -ENOMEM;
ff->fh = 0;
- ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */
- if (!fc->no_open || isdir) {
+ /* Default for no-open */
+ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
+ if (isdir ? !fc->no_opendir : !fc->no_open) {
struct fuse_open_out outarg;
int err;
@@ -138,11 +135,14 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
ff->fh = outarg.fh;
ff->open_flags = outarg.open_flags;
- } else if (err != -ENOSYS || isdir) {
+ } else if (err != -ENOSYS) {
fuse_file_free(ff);
return err;
} else {
- fc->no_open = 1;
+ if (isdir)
+ fc->no_opendir = 1;
+ else
+ fc->no_open = 1;
}
}
@@ -159,17 +159,16 @@ EXPORT_SYMBOL_GPL(fuse_do_open);
static void fuse_link_write_file(struct file *file)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_file *ff = file->private_data;
/*
* file may be written through mmap, so chain it onto the
* inodes's write_file list
*/
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if (list_empty(&ff->write_entry))
list_add(&ff->write_entry, &fi->write_files);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
}
void fuse_finish_open(struct inode *inode, struct file *file)
@@ -177,8 +176,6 @@ void fuse_finish_open(struct inode *inode, struct file *file)
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = get_fuse_conn(inode);
- if (ff->open_flags & FOPEN_DIRECT_IO)
- file->f_op = &fuse_direct_io_file_operations;
if (!(ff->open_flags & FOPEN_KEEP_CACHE))
invalidate_inode_pages2(inode->i_mapping);
if (ff->open_flags & FOPEN_NONSEEKABLE)
@@ -186,10 +183,10 @@ void fuse_finish_open(struct inode *inode, struct file *file)
if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_invalidate_attr(inode);
if (fc->writeback_cache)
file_update_time(file);
@@ -224,14 +221,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
return err;
}
-static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
+static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
+ int flags, int opcode)
{
struct fuse_conn *fc = ff->fc;
struct fuse_req *req = ff->reserved_req;
struct fuse_release_in *inarg = &req->misc.release.in;
+ /* Inode is NULL on error path of fuse_create_open() */
+ if (likely(fi)) {
+ spin_lock(&fi->lock);
+ list_del(&ff->write_entry);
+ spin_unlock(&fi->lock);
+ }
spin_lock(&fc->lock);
- list_del(&ff->write_entry);
if (!RB_EMPTY_NODE(&ff->polled_node))
rb_erase(&ff->polled_node, &fc->polled_files);
spin_unlock(&fc->lock);
@@ -249,11 +252,12 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
void fuse_release_common(struct file *file, bool isdir)
{
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
struct fuse_file *ff = file->private_data;
struct fuse_req *req = ff->reserved_req;
int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
- fuse_prepare_release(ff, file->f_flags, opcode);
+ fuse_prepare_release(fi, ff, file->f_flags, opcode);
if (ff->flock) {
struct fuse_release_in *inarg = &req->misc.release.in;
@@ -295,10 +299,10 @@ static int fuse_release(struct inode *inode, struct file *file)
return 0;
}
-void fuse_sync_release(struct fuse_file *ff, int flags)
+void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags)
{
WARN_ON(refcount_read(&ff->count) > 1);
- fuse_prepare_release(ff, flags, FUSE_RELEASE);
+ fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
/*
* iput(NULL) is a no-op and since the refcount is 1 and everything's
* synchronous, we are fine with not doing igrab() here"
@@ -329,33 +333,39 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
return (u64) v0 + ((u64) v1 << 32);
}
-/*
- * Check if any page in a range is under writeback
- *
- * This is currently done by walking the list of writepage requests
- * for the inode, which can be pretty inefficient.
- */
-static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
- pgoff_t idx_to)
+static struct fuse_req *fuse_find_writeback(struct fuse_inode *fi,
+ pgoff_t idx_from, pgoff_t idx_to)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_req *req;
- bool found = false;
- spin_lock(&fc->lock);
list_for_each_entry(req, &fi->writepages, writepages_entry) {
pgoff_t curr_index;
- BUG_ON(req->inode != inode);
+ WARN_ON(get_fuse_inode(req->inode) != fi);
curr_index = req->misc.write.in.offset >> PAGE_SHIFT;
if (idx_from < curr_index + req->num_pages &&
curr_index <= idx_to) {
- found = true;
- break;
+ return req;
}
}
- spin_unlock(&fc->lock);
+ return NULL;
+}
+
+/*
+ * Check if any page in a range is under writeback
+ *
+ * This is currently done by walking the list of writepage requests
+ * for the inode, which can be pretty inefficient.
+ */
+static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
+ pgoff_t idx_to)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ bool found;
+
+ spin_lock(&fi->lock);
+ found = fuse_find_writeback(fi, idx_from, idx_to);
+ spin_unlock(&fi->lock);
return found;
}
@@ -598,9 +608,9 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
- spin_unlock(&fc->lock);
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ spin_unlock(&fi->lock);
}
io->iocb->ki_complete(io->iocb, res, 0);
@@ -675,13 +685,13 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if (attr_ver == fi->attr_version && size < inode->i_size &&
!test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
- fi->attr_version = ++fc->attr_version;
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, size);
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
}
static void fuse_short_read(struct fuse_req *req, struct inode *inode,
@@ -919,7 +929,7 @@ out:
return err;
}
-static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -996,13 +1006,13 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos)
struct fuse_inode *fi = get_fuse_inode(inode);
bool ret = false;
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
if (pos > inode->i_size) {
i_size_write(inode, pos);
ret = true;
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
return ret;
}
@@ -1125,9 +1135,6 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
int err = 0;
ssize_t res = 0;
- if (is_bad_inode(inode))
- return -EIO;
-
if (inode->i_size < pos + iov_iter_count(ii))
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
@@ -1173,7 +1180,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
return res > 0 ? res : err;
}
-static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -1416,9 +1423,6 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
ssize_t res;
struct inode *inode = file_inode(io->iocb->ki_filp);
- if (is_bad_inode(inode))
- return -EIO;
-
res = fuse_direct_io(io, iter, ppos, 0);
fuse_invalidate_atime(inode);
@@ -1426,10 +1430,21 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
return res;
}
+static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
+
static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
- return __fuse_direct_read(&io, to, &iocb->ki_pos);
+ ssize_t res;
+
+ if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
+ res = fuse_direct_IO(iocb, to);
+ } else {
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+
+ res = __fuse_direct_read(&io, to, &iocb->ki_pos);
+ }
+
+ return res;
}
static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -1438,14 +1453,17 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
ssize_t res;
- if (is_bad_inode(inode))
- return -EIO;
-
/* Don't allow parallel writes to the same file */
inode_lock(inode);
res = generic_write_checks(iocb, from);
- if (res > 0)
- res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+ if (res > 0) {
+ if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
+ res = fuse_direct_IO(iocb, from);
+ } else {
+ res = fuse_direct_io(&io, from, &iocb->ki_pos,
+ FUSE_DIO_WRITE);
+ }
+ }
fuse_invalidate_attr(inode);
if (res > 0)
fuse_write_update_size(inode, iocb->ki_pos);
@@ -1454,6 +1472,34 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
return res;
}
+static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+
+ if (is_bad_inode(file_inode(file)))
+ return -EIO;
+
+ if (!(ff->open_flags & FOPEN_DIRECT_IO))
+ return fuse_cache_read_iter(iocb, to);
+ else
+ return fuse_direct_read_iter(iocb, to);
+}
+
+static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+
+ if (is_bad_inode(file_inode(file)))
+ return -EIO;
+
+ if (!(ff->open_flags & FOPEN_DIRECT_IO))
+ return fuse_cache_write_iter(iocb, from);
+ else
+ return fuse_direct_write_iter(iocb, from);
+}
+
static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
{
int i;
@@ -1481,20 +1527,18 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&fi->page_waitq);
}
-/* Called under fc->lock, may release and reacquire it */
+/* Called under fi->lock, may release and reacquire it */
static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
loff_t size)
-__releases(fc->lock)
-__acquires(fc->lock)
+__releases(fi->lock)
+__acquires(fi->lock)
{
+ struct fuse_req *aux, *next;
struct fuse_inode *fi = get_fuse_inode(req->inode);
struct fuse_write_in *inarg = &req->misc.write.in;
__u64 data_size = req->num_pages * PAGE_SIZE;
bool queued;
- if (!fc->connected)
- goto out_free;
-
if (inarg->offset + data_size <= size) {
inarg->size = data_size;
} else if (inarg->offset < size) {
@@ -1505,28 +1549,40 @@ __acquires(fc->lock)
}
req->in.args[1].size = inarg->size;
- fi->writectr++;
queued = fuse_request_queue_background(fc, req);
- WARN_ON(!queued);
+ /* Fails on broken connection only */
+ if (unlikely(!queued))
+ goto out_free;
+
+ fi->writectr++;
return;
out_free:
fuse_writepage_finish(fc, req);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
+
+ /* After fuse_writepage_finish() aux request list is private */
+ for (aux = req->misc.write.next; aux; aux = next) {
+ next = aux->misc.write.next;
+ aux->misc.write.next = NULL;
+ fuse_writepage_free(fc, aux);
+ fuse_put_request(fc, aux);
+ }
+
fuse_writepage_free(fc, req);
fuse_put_request(fc, req);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
}
/*
* If fi->writectr is positive (no truncate or fsync going on) send
* all queued writepage requests.
*
- * Called with fc->lock
+ * Called with fi->lock
*/
void fuse_flush_writepages(struct inode *inode)
-__releases(fc->lock)
-__acquires(fc->lock)
+__releases(fi->lock)
+__acquires(fi->lock)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1546,7 +1602,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
struct fuse_inode *fi = get_fuse_inode(inode);
mapping_set_error(inode->i_mapping, req->out.h.error);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
while (req->misc.write.next) {
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_write_in *inarg = &req->misc.write.in;
@@ -1583,7 +1639,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
}
fi->writectr--;
fuse_writepage_finish(fc, req);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_writepage_free(fc, req);
}
@@ -1592,13 +1648,13 @@ static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
{
struct fuse_file *ff = NULL;
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if (!list_empty(&fi->write_files)) {
ff = list_entry(fi->write_files.next, struct fuse_file,
write_entry);
fuse_file_get(ff);
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
return ff;
}
@@ -1669,11 +1725,11 @@ static int fuse_writepage_locked(struct page *page)
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
list_add(&req->writepages_entry, &fi->writepages);
list_add_tail(&req->list, &fi->queued_writes);
fuse_flush_writepages(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
end_page_writeback(page);
@@ -1722,21 +1778,27 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
{
struct fuse_req *req = data->req;
struct inode *inode = data->inode;
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
int num_pages = req->num_pages;
int i;
req->ff = fuse_file_get(data->ff);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
list_add_tail(&req->list, &fi->queued_writes);
fuse_flush_writepages(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
for (i = 0; i < num_pages; i++)
end_page_writeback(data->orig_pages[i]);
}
+/*
+ * First recheck under fi->lock if the offending offset is still under
+ * writeback. If yes, then iterate auxiliary write requests, to see if there's
+ * one already added for a page at this offset. If there's none, then insert
+ * this new request onto the auxiliary list, otherwise reuse the existing one by
+ * copying the new page contents over to the old temporary page.
+ */
static bool fuse_writepage_in_flight(struct fuse_req *new_req,
struct page *page)
{
@@ -1744,57 +1806,50 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
struct fuse_inode *fi = get_fuse_inode(new_req->inode);
struct fuse_req *tmp;
struct fuse_req *old_req;
- bool found = false;
- pgoff_t curr_index;
- BUG_ON(new_req->num_pages != 0);
+ WARN_ON(new_req->num_pages != 0);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
list_del(&new_req->writepages_entry);
- list_for_each_entry(old_req, &fi->writepages, writepages_entry) {
- BUG_ON(old_req->inode != new_req->inode);
- curr_index = old_req->misc.write.in.offset >> PAGE_SHIFT;
- if (curr_index <= page->index &&
- page->index < curr_index + old_req->num_pages) {
- found = true;
- break;
- }
- }
- if (!found) {
+ old_req = fuse_find_writeback(fi, page->index, page->index);
+ if (!old_req) {
list_add(&new_req->writepages_entry, &fi->writepages);
- goto out_unlock;
+ spin_unlock(&fi->lock);
+ return false;
}
new_req->num_pages = 1;
- for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
- BUG_ON(tmp->inode != new_req->inode);
+ for (tmp = old_req->misc.write.next; tmp; tmp = tmp->misc.write.next) {
+ pgoff_t curr_index;
+
+ WARN_ON(tmp->inode != new_req->inode);
curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT;
- if (tmp->num_pages == 1 &&
- curr_index == page->index) {
- old_req = tmp;
+ if (curr_index == page->index) {
+ WARN_ON(tmp->num_pages != 1);
+ WARN_ON(!test_bit(FR_PENDING, &tmp->flags));
+ swap(tmp->pages[0], new_req->pages[0]);
+ break;
}
}
- if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) {
- struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
+ if (!tmp) {
+ new_req->misc.write.next = old_req->misc.write.next;
+ old_req->misc.write.next = new_req;
+ }
+
+ spin_unlock(&fi->lock);
- copy_highpage(old_req->pages[0], page);
- spin_unlock(&fc->lock);
+ if (tmp) {
+ struct backing_dev_info *bdi = inode_to_bdi(new_req->inode);
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP);
wb_writeout_inc(&bdi->wb);
fuse_writepage_free(fc, new_req);
fuse_request_free(new_req);
- goto out;
- } else {
- new_req->misc.write.next = old_req->misc.write.next;
- old_req->misc.write.next = new_req;
}
-out_unlock:
- spin_unlock(&fc->lock);
-out:
- return found;
+
+ return true;
}
static int fuse_writepages_fill(struct page *page,
@@ -1803,6 +1858,7 @@ static int fuse_writepages_fill(struct page *page,
struct fuse_fill_wb_data *data = _data;
struct fuse_req *req = data->req;
struct inode *inode = data->inode;
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
struct page *tmp_page;
bool is_writeback;
@@ -1873,9 +1929,9 @@ static int fuse_writepages_fill(struct page *page,
req->end = fuse_writepage_end;
req->inode = inode;
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
list_add(&req->writepages_entry, &fi->writepages);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
data->req = req;
}
@@ -1898,12 +1954,12 @@ static int fuse_writepages_fill(struct page *page,
data->orig_pages[req->num_pages] = page;
/*
- * Protected by fc->lock against concurrent access by
+ * Protected by fi->lock against concurrent access by
* fuse_page_is_writeback().
*/
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
req->num_pages++;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
out_unlock:
unlock_page(page);
@@ -2087,6 +2143,18 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct fuse_file *ff = file->private_data;
+
+ if (ff->open_flags & FOPEN_DIRECT_IO) {
+ /* Can't provide the coherency needed for MAP_SHARED */
+ if (vma->vm_flags & VM_MAYSHARE)
+ return -ENODEV;
+
+ invalidate_inode_pages2(file->f_mapping);
+
+ return generic_file_mmap(file, vma);
+ }
+
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
fuse_link_write_file(file);
@@ -2095,17 +2163,6 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
-{
- /* Can't provide the coherency needed for MAP_SHARED */
- if (vma->vm_flags & VM_MAYSHARE)
- return -ENODEV;
-
- invalidate_inode_pages2(file->f_mapping);
-
- return generic_file_mmap(file, vma);
-}
-
static int convert_fuse_file_lock(struct fuse_conn *fc,
const struct fuse_file_lock *ffl,
struct file_lock *fl)
@@ -3114,6 +3171,7 @@ static const struct file_operations fuse_file_operations = {
.lock = fuse_file_lock,
.flock = fuse_file_flock,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl = fuse_file_ioctl,
.compat_ioctl = fuse_file_compat_ioctl,
.poll = fuse_file_poll,
@@ -3121,24 +3179,6 @@ static const struct file_operations fuse_file_operations = {
.copy_file_range = fuse_copy_file_range,
};
-static const struct file_operations fuse_direct_io_file_operations = {
- .llseek = fuse_file_llseek,
- .read_iter = fuse_direct_read_iter,
- .write_iter = fuse_direct_write_iter,
- .mmap = fuse_direct_mmap,
- .open = fuse_open,
- .flush = fuse_flush,
- .release = fuse_release,
- .fsync = fuse_fsync,
- .lock = fuse_file_lock,
- .flock = fuse_file_flock,
- .unlocked_ioctl = fuse_file_ioctl,
- .compat_ioctl = fuse_file_compat_ioctl,
- .poll = fuse_file_poll,
- .fallocate = fuse_file_fallocate,
- /* no splice_read */
-};
-
static const struct address_space_operations fuse_file_aops = {
.readpage = fuse_readpage,
.writepage = fuse_writepage,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 2f2c92e6f8cb..0920c0c032a0 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -96,7 +96,7 @@ struct fuse_inode {
union {
/* Write related fields (regular file only) */
struct {
- /* Files usable in writepage. Protected by fc->lock */
+ /* Files usable in writepage. Protected by fi->lock */
struct list_head write_files;
/* Writepages pending on truncate or fsync */
@@ -144,6 +144,9 @@ struct fuse_inode {
/** Lock for serializing lookup and readdir for back compatibility*/
struct mutex mutex;
+
+ /** Lock to protect write related fields */
+ spinlock_t lock;
};
/** FUSE inode state bits */
@@ -163,7 +166,10 @@ struct fuse_file {
/** Fuse connection for this file */
struct fuse_conn *fc;
- /** Request reserved for flush and release */
+ /*
+ * Request reserved for flush and release.
+ * Modified under relative fuse_inode::lock.
+ */
struct fuse_req *reserved_req;
/** Kernel file handle guaranteed to be unique */
@@ -538,7 +544,7 @@ struct fuse_conn {
struct fuse_iqueue iq;
/** The next unique kernel file handle */
- u64 khctr;
+ atomic64_t khctr;
/** rbtree of fuse_files waiting for poll events indexed by ph */
struct rb_root polled_files;
@@ -624,6 +630,9 @@ struct fuse_conn {
/** Is open/release not implemented by fs? */
unsigned no_open:1;
+ /** Is opendir/releasedir not implemented by fs? */
+ unsigned no_opendir:1;
+
/** Is fsync not implemented by fs? */
unsigned no_fsync:1;
@@ -730,7 +739,7 @@ struct fuse_conn {
struct fuse_req *destroy_req;
/** Version counter for attribute changes */
- u64 attr_version;
+ atomic64_t attr_version;
/** Called on final put */
void (*release)(struct fuse_conn *);
@@ -770,6 +779,11 @@ static inline int invalid_nodeid(u64 nodeid)
return !nodeid || nodeid == FUSE_ROOT_ID;
}
+static inline u64 fuse_get_attr_version(struct fuse_conn *fc)
+{
+ return atomic64_read(&fc->attr_version);
+}
+
/** Device operations */
extern const struct file_operations fuse_dev_operations;
@@ -817,7 +831,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
void fuse_file_free(struct fuse_file *ff);
void fuse_finish_open(struct inode *inode, struct file *file);
-void fuse_sync_release(struct fuse_file *ff, int flags);
+void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags);
/**
* Send RELEASE or RELEASEDIR request
@@ -936,7 +950,7 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req);
/* Abort all requests */
-void fuse_abort_conn(struct fuse_conn *fc, bool is_abort);
+void fuse_abort_conn(struct fuse_conn *fc);
void fuse_wait_aborted(struct fuse_conn *fc);
/**
@@ -1000,8 +1014,6 @@ void fuse_flush_writepages(struct inode *inode);
void fuse_set_nowrite(struct inode *inode);
void fuse_release_nowrite(struct inode *inode);
-u64 fuse_get_attr_version(struct fuse_conn *fc);
-
/**
* File-system tells the kernel to invalidate cache for the given node id.
*/
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index c2d4099429be..ec5d9953dfb6 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -97,6 +97,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->orig_ino = 0;
fi->state = 0;
mutex_init(&fi->mutex);
+ spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
if (!fi->forget) {
kmem_cache_free(fuse_inode_cachep, inode);
@@ -163,7 +164,9 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- fi->attr_version = ++fc->attr_version;
+ lockdep_assert_held(&fi->lock);
+
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
fi->i_time = attr_valid;
WRITE_ONCE(fi->inval_mask, 0);
@@ -209,10 +212,10 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
loff_t oldsize;
struct timespec64 old_mtime;
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if ((attr_version != 0 && fi->attr_version > attr_version) ||
test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
return;
}
@@ -227,7 +230,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
*/
if (!is_wb || !S_ISREG(inode->i_mode))
i_size_write(inode, attr->size);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
if (!is_wb && S_ISREG(inode->i_mode)) {
bool inval = false;
@@ -322,9 +325,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
}
fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
fi->nlookup++;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_change_attributes(inode, attr, attr_valid, attr_version);
return inode;
@@ -376,7 +379,7 @@ void fuse_unlock_inode(struct inode *inode, bool locked)
static void fuse_umount_begin(struct super_block *sb)
{
- fuse_abort_conn(get_fuse_conn_super(sb), false);
+ fuse_abort_conn(get_fuse_conn_super(sb));
}
static void fuse_send_destroy(struct fuse_conn *fc)
@@ -619,12 +622,12 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
atomic_set(&fc->num_waiting, 0);
fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
- fc->khctr = 0;
+ atomic64_set(&fc->khctr, 0);
fc->polled_files = RB_ROOT;
fc->blocked = 0;
fc->initialized = 0;
fc->connected = 1;
- fc->attr_version = 1;
+ atomic64_set(&fc->attr_version, 1);
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
fc->user_ns = get_user_ns(user_ns);
@@ -969,7 +972,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
- FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS;
+ FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
+ FUSE_NO_OPENDIR_SUPPORT;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);
@@ -1010,7 +1014,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
if (err)
return err;
- sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* fuse does it's own writeback accounting */
sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
@@ -1242,7 +1246,7 @@ static void fuse_sb_destroy(struct super_block *sb)
if (fc) {
fuse_send_destroy(fc);
- fuse_abort_conn(fc, false);
+ fuse_abort_conn(fc);
fuse_wait_aborted(fc);
down_write(&fc->killsb);
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index ab18b78f4755..574d03f8a573 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -213,9 +213,9 @@ retry:
}
fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
fi->nlookup++;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
forget_all_cached_acls(inode);
fuse_change_attributes(inode, &o->attr,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index b92740edc416..d32964cd1117 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -107,7 +107,7 @@ static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode,
static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name)
{
- u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0);
+ u32 hash = jhash2((u32 *)name, ht_parms.key_len / 4, 0);
return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS);
}
@@ -2131,71 +2131,29 @@ static const struct file_operations gfs2_sbstats_fops = {
.release = seq_release,
};
-int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
-{
- struct dentry *dent;
-
- dent = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
- if (IS_ERR_OR_NULL(dent))
- goto fail;
- sdp->debugfs_dir = dent;
-
- dent = debugfs_create_file("glocks",
- S_IFREG | S_IRUGO,
- sdp->debugfs_dir, sdp,
- &gfs2_glocks_fops);
- if (IS_ERR_OR_NULL(dent))
- goto fail;
- sdp->debugfs_dentry_glocks = dent;
-
- dent = debugfs_create_file("glstats",
- S_IFREG | S_IRUGO,
- sdp->debugfs_dir, sdp,
- &gfs2_glstats_fops);
- if (IS_ERR_OR_NULL(dent))
- goto fail;
- sdp->debugfs_dentry_glstats = dent;
-
- dent = debugfs_create_file("sbstats",
- S_IFREG | S_IRUGO,
- sdp->debugfs_dir, sdp,
- &gfs2_sbstats_fops);
- if (IS_ERR_OR_NULL(dent))
- goto fail;
- sdp->debugfs_dentry_sbstats = dent;
+void gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
+{
+ sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
- return 0;
-fail:
- gfs2_delete_debugfs_file(sdp);
- return dent ? PTR_ERR(dent) : -ENOMEM;
+ debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
+ &gfs2_glocks_fops);
+
+ debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
+ &gfs2_glstats_fops);
+
+ debugfs_create_file("sbstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
+ &gfs2_sbstats_fops);
}
void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
{
- if (sdp->debugfs_dir) {
- if (sdp->debugfs_dentry_glocks) {
- debugfs_remove(sdp->debugfs_dentry_glocks);
- sdp->debugfs_dentry_glocks = NULL;
- }
- if (sdp->debugfs_dentry_glstats) {
- debugfs_remove(sdp->debugfs_dentry_glstats);
- sdp->debugfs_dentry_glstats = NULL;
- }
- if (sdp->debugfs_dentry_sbstats) {
- debugfs_remove(sdp->debugfs_dentry_sbstats);
- sdp->debugfs_dentry_sbstats = NULL;
- }
- debugfs_remove(sdp->debugfs_dir);
- sdp->debugfs_dir = NULL;
- }
+ debugfs_remove_recursive(sdp->debugfs_dir);
+ sdp->debugfs_dir = NULL;
}
-int gfs2_register_debugfs(void)
+void gfs2_register_debugfs(void)
{
gfs2_root = debugfs_create_dir("gfs2", NULL);
- if (IS_ERR(gfs2_root))
- return PTR_ERR(gfs2_root);
- return gfs2_root ? 0 : -ENOMEM;
}
void gfs2_unregister_debugfs(void)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 8949bf28b249..936b3295839c 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -243,9 +243,9 @@ extern void gfs2_glock_free(struct gfs2_glock *gl);
extern int __init gfs2_glock_init(void);
extern void gfs2_glock_exit(void);
-extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
+extern void gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
-extern int gfs2_register_debugfs(void);
+extern void gfs2_register_debugfs(void);
extern void gfs2_unregister_debugfs(void);
extern const struct lm_lockops gfs2_dlm_ops;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e10e0b0a7cd5..cdf07b408f54 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -853,9 +853,6 @@ struct gfs2_sbd {
unsigned long sd_last_warning;
struct dentry *debugfs_dir; /* debugfs directory */
- struct dentry *debugfs_dentry_glocks;
- struct dentry *debugfs_dentry_glstats;
- struct dentry *debugfs_dentry_sbstats;
};
static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which)
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 793808263c6d..18d4af7417fa 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -59,8 +59,8 @@ static inline u64 gfs2_get_inode_blocks(const struct inode *inode)
static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change)
{
- gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change));
- change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK);
+ change <<= inode->i_blkbits - GFS2_BASIC_BLOCK_SHIFT;
+ gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change));
inode->i_blocks += change;
}
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index c7603063f861..136484ef35d3 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -178,16 +178,12 @@ static int __init init_gfs2_fs(void)
if (!gfs2_page_pool)
goto fail_mempool;
- error = gfs2_register_debugfs();
- if (error)
- goto fail_debugfs;
+ gfs2_register_debugfs();
pr_info("GFS2 installed\n");
return 0;
-fail_debugfs:
- mempool_destroy(gfs2_page_pool);
fail_mempool:
destroy_workqueue(gfs2_freeze_wq);
fail_wq3:
diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h
index 823a328791c0..302f45101a96 100644
--- a/fs/hpfs/hpfs.h
+++ b/fs/hpfs/hpfs.h
@@ -120,11 +120,11 @@ struct hpfs_spare_block
u8 bad_sector: 1; /* bad sector, corrupted disk (???) */
u8 bad_bitmap: 1; /* bad bitmap */
u8 fast: 1; /* partition was fast formatted */
- u8 old_wrote: 1; /* old version wrote to partion */
- u8 old_wrote_1: 1; /* old version wrote to partion (?) */
+ u8 old_wrote: 1; /* old version wrote to partition */
+ u8 old_wrote_1: 1; /* old version wrote to partition (?) */
#else
- u8 old_wrote_1: 1; /* old version wrote to partion (?) */
- u8 old_wrote: 1; /* old version wrote to partion */
+ u8 old_wrote_1: 1; /* old version wrote to partition (?) */
+ u8 old_wrote: 1; /* old version wrote to partition */
u8 fast: 1; /* partition was fast formatted */
u8 bad_bitmap: 1; /* bad bitmap */
u8 bad_sector: 1; /* bad sector, corrupted disk (???) */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b0eef008de67..9285dd4f4b1c 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -27,7 +27,7 @@
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/dnotify.h>
@@ -45,11 +45,17 @@ const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
-struct hugetlbfs_config {
+enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };
+
+struct hugetlbfs_fs_context {
struct hstate *hstate;
+ unsigned long long max_size_opt;
+ unsigned long long min_size_opt;
long max_hpages;
long nr_inodes;
long min_hpages;
+ enum hugetlbfs_size_type max_val_type;
+ enum hugetlbfs_size_type min_val_type;
kuid_t uid;
kgid_t gid;
umode_t mode;
@@ -57,22 +63,30 @@ struct hugetlbfs_config {
int sysctl_hugetlb_shm_group;
-enum {
- Opt_size, Opt_nr_inodes,
- Opt_mode, Opt_uid, Opt_gid,
- Opt_pagesize, Opt_min_size,
- Opt_err,
+enum hugetlb_param {
+ Opt_gid,
+ Opt_min_size,
+ Opt_mode,
+ Opt_nr_inodes,
+ Opt_pagesize,
+ Opt_size,
+ Opt_uid,
};
-static const match_table_t tokens = {
- {Opt_size, "size=%s"},
- {Opt_nr_inodes, "nr_inodes=%s"},
- {Opt_mode, "mode=%o"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_pagesize, "pagesize=%s"},
- {Opt_min_size, "min_size=%s"},
- {Opt_err, NULL},
+static const struct fs_parameter_spec hugetlb_param_specs[] = {
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_string("min_size", Opt_min_size),
+ fsparam_u32 ("mode", Opt_mode),
+ fsparam_string("nr_inodes", Opt_nr_inodes),
+ fsparam_string("pagesize", Opt_pagesize),
+ fsparam_string("size", Opt_size),
+ fsparam_u32 ("uid", Opt_uid),
+ {}
+};
+
+static const struct fs_parameter_description hugetlb_fs_parameters = {
+ .name = "hugetlbfs",
+ .specs = hugetlb_param_specs,
};
#ifdef CONFIG_NUMA
@@ -708,16 +722,16 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
}
static struct inode *hugetlbfs_get_root(struct super_block *sb,
- struct hugetlbfs_config *config)
+ struct hugetlbfs_fs_context *ctx)
{
struct inode *inode;
inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_mode = S_IFDIR | config->mode;
- inode->i_uid = config->uid;
- inode->i_gid = config->gid;
+ inode->i_mode = S_IFDIR | ctx->mode;
+ inode->i_uid = ctx->uid;
+ inode->i_gid = ctx->gid;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_op = &hugetlbfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
@@ -741,11 +755,17 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
umode_t mode, dev_t dev)
{
struct inode *inode;
- struct resv_map *resv_map;
+ struct resv_map *resv_map = NULL;
- resv_map = resv_map_alloc();
- if (!resv_map)
- return NULL;
+ /*
+ * Reserve maps are only needed for inodes that can have associated
+ * page allocations.
+ */
+ if (S_ISREG(mode) || S_ISLNK(mode)) {
+ resv_map = resv_map_alloc();
+ if (!resv_map)
+ return NULL;
+ }
inode = new_inode(sb);
if (inode) {
@@ -780,8 +800,10 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
break;
}
lockdep_annotate_inode_mutex_key(inode);
- } else
- kref_put(&resv_map->refs, resv_map_release);
+ } else {
+ if (resv_map)
+ kref_put(&resv_map->refs, resv_map_release);
+ }
return inode;
}
@@ -1093,8 +1115,6 @@ static const struct super_operations hugetlbfs_ops = {
.show_options = hugetlbfs_show_options,
};
-enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };
-
/*
* Convert size option passed from command line to number of huge pages
* in the pool specified by hstate. Size option could be in bytes
@@ -1117,170 +1137,151 @@ hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
return size_opt;
}
-static int
-hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
+/*
+ * Parse one mount parameter.
+ */
+static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p, *rest;
- substring_t args[MAX_OPT_ARGS];
- int option;
- unsigned long long max_size_opt = 0, min_size_opt = 0;
- enum hugetlbfs_size_type max_val_type = NO_SIZE, min_val_type = NO_SIZE;
-
- if (!options)
+ struct hugetlbfs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ char *rest;
+ unsigned long ps;
+ int opt;
+
+ opt = fs_parse(fc, &hugetlb_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ ctx->uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(ctx->uid))
+ goto bad_val;
return 0;
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
+ case Opt_gid:
+ ctx->gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(ctx->gid))
+ goto bad_val;
+ return 0;
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- goto bad_val;
- pconfig->uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(pconfig->uid))
- goto bad_val;
- break;
+ case Opt_mode:
+ ctx->mode = result.uint_32 & 01777U;
+ return 0;
- case Opt_gid:
- if (match_int(&args[0], &option))
- goto bad_val;
- pconfig->gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(pconfig->gid))
- goto bad_val;
- break;
+ case Opt_size:
+ /* memparse() will accept a K/M/G without a digit */
+ if (!isdigit(param->string[0]))
+ goto bad_val;
+ ctx->max_size_opt = memparse(param->string, &rest);
+ ctx->max_val_type = SIZE_STD;
+ if (*rest == '%')
+ ctx->max_val_type = SIZE_PERCENT;
+ return 0;
- case Opt_mode:
- if (match_octal(&args[0], &option))
- goto bad_val;
- pconfig->mode = option & 01777U;
- break;
+ case Opt_nr_inodes:
+ /* memparse() will accept a K/M/G without a digit */
+ if (!isdigit(param->string[0]))
+ goto bad_val;
+ ctx->nr_inodes = memparse(param->string, &rest);
+ return 0;
- case Opt_size: {
- /* memparse() will accept a K/M/G without a digit */
- if (!isdigit(*args[0].from))
- goto bad_val;
- max_size_opt = memparse(args[0].from, &rest);
- max_val_type = SIZE_STD;
- if (*rest == '%')
- max_val_type = SIZE_PERCENT;
- break;
+ case Opt_pagesize:
+ ps = memparse(param->string, &rest);
+ ctx->hstate = size_to_hstate(ps);
+ if (!ctx->hstate) {
+ pr_err("Unsupported page size %lu MB\n", ps >> 20);
+ return -EINVAL;
}
+ return 0;
- case Opt_nr_inodes:
- /* memparse() will accept a K/M/G without a digit */
- if (!isdigit(*args[0].from))
- goto bad_val;
- pconfig->nr_inodes = memparse(args[0].from, &rest);
- break;
+ case Opt_min_size:
+ /* memparse() will accept a K/M/G without a digit */
+ if (!isdigit(param->string[0]))
+ goto bad_val;
+ ctx->min_size_opt = memparse(param->string, &rest);
+ ctx->min_val_type = SIZE_STD;
+ if (*rest == '%')
+ ctx->min_val_type = SIZE_PERCENT;
+ return 0;
- case Opt_pagesize: {
- unsigned long ps;
- ps = memparse(args[0].from, &rest);
- pconfig->hstate = size_to_hstate(ps);
- if (!pconfig->hstate) {
- pr_err("Unsupported page size %lu MB\n",
- ps >> 20);
- return -EINVAL;
- }
- break;
- }
+ default:
+ return -EINVAL;
+ }
- case Opt_min_size: {
- /* memparse() will accept a K/M/G without a digit */
- if (!isdigit(*args[0].from))
- goto bad_val;
- min_size_opt = memparse(args[0].from, &rest);
- min_val_type = SIZE_STD;
- if (*rest == '%')
- min_val_type = SIZE_PERCENT;
- break;
- }
+bad_val:
+ return invalf(fc, "hugetlbfs: Bad value '%s' for mount option '%s'\n",
+ param->string, param->key);
+}
- default:
- pr_err("Bad mount option: \"%s\"\n", p);
- return -EINVAL;
- break;
- }
- }
+/*
+ * Validate the parsed options.
+ */
+static int hugetlbfs_validate(struct fs_context *fc)
+{
+ struct hugetlbfs_fs_context *ctx = fc->fs_private;
/*
* Use huge page pool size (in hstate) to convert the size
* options to number of huge pages. If NO_SIZE, -1 is returned.
*/
- pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
- max_size_opt, max_val_type);
- pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
- min_size_opt, min_val_type);
+ ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
+ ctx->max_size_opt,
+ ctx->max_val_type);
+ ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
+ ctx->min_size_opt,
+ ctx->min_val_type);
/*
* If max_size was specified, then min_size must be smaller
*/
- if (max_val_type > NO_SIZE &&
- pconfig->min_hpages > pconfig->max_hpages) {
- pr_err("minimum size can not be greater than maximum size\n");
+ if (ctx->max_val_type > NO_SIZE &&
+ ctx->min_hpages > ctx->max_hpages) {
+ pr_err("Minimum size can not be greater than maximum size\n");
return -EINVAL;
}
return 0;
-
-bad_val:
- pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
- return -EINVAL;
}
static int
-hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
+hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
- int ret;
- struct hugetlbfs_config config;
+ struct hugetlbfs_fs_context *ctx = fc->fs_private;
struct hugetlbfs_sb_info *sbinfo;
- config.max_hpages = -1; /* No limit on size by default */
- config.nr_inodes = -1; /* No limit on number of inodes by default */
- config.uid = current_fsuid();
- config.gid = current_fsgid();
- config.mode = 0755;
- config.hstate = &default_hstate;
- config.min_hpages = -1; /* No default minimum size */
- ret = hugetlbfs_parse_options(data, &config);
- if (ret)
- return ret;
-
sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
if (!sbinfo)
return -ENOMEM;
sb->s_fs_info = sbinfo;
- sbinfo->hstate = config.hstate;
spin_lock_init(&sbinfo->stat_lock);
- sbinfo->max_inodes = config.nr_inodes;
- sbinfo->free_inodes = config.nr_inodes;
- sbinfo->spool = NULL;
- sbinfo->uid = config.uid;
- sbinfo->gid = config.gid;
- sbinfo->mode = config.mode;
+ sbinfo->hstate = ctx->hstate;
+ sbinfo->max_inodes = ctx->nr_inodes;
+ sbinfo->free_inodes = ctx->nr_inodes;
+ sbinfo->spool = NULL;
+ sbinfo->uid = ctx->uid;
+ sbinfo->gid = ctx->gid;
+ sbinfo->mode = ctx->mode;
/*
* Allocate and initialize subpool if maximum or minimum size is
* specified. Any needed reservations (for minimim size) are taken
* taken when the subpool is created.
*/
- if (config.max_hpages != -1 || config.min_hpages != -1) {
- sbinfo->spool = hugepage_new_subpool(config.hstate,
- config.max_hpages,
- config.min_hpages);
+ if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
+ sbinfo->spool = hugepage_new_subpool(ctx->hstate,
+ ctx->max_hpages,
+ ctx->min_hpages);
if (!sbinfo->spool)
goto out_free;
}
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = huge_page_size(config.hstate);
- sb->s_blocksize_bits = huge_page_shift(config.hstate);
+ sb->s_blocksize = huge_page_size(ctx->hstate);
+ sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
sb->s_time_gran = 1;
- sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
+ sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
if (!sb->s_root)
goto out_free;
return 0;
@@ -1290,16 +1291,52 @@ out_free:
return -ENOMEM;
}
-static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int hugetlbfs_get_tree(struct fs_context *fc)
{
- return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
+ int err = hugetlbfs_validate(fc);
+ if (err)
+ return err;
+ return vfs_get_super(fc, vfs_get_independent_super, hugetlbfs_fill_super);
+}
+
+static void hugetlbfs_fs_context_free(struct fs_context *fc)
+{
+ kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations hugetlbfs_fs_context_ops = {
+ .free = hugetlbfs_fs_context_free,
+ .parse_param = hugetlbfs_parse_param,
+ .get_tree = hugetlbfs_get_tree,
+};
+
+static int hugetlbfs_init_fs_context(struct fs_context *fc)
+{
+ struct hugetlbfs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->max_hpages = -1; /* No limit on size by default */
+ ctx->nr_inodes = -1; /* No limit on number of inodes by default */
+ ctx->uid = current_fsuid();
+ ctx->gid = current_fsgid();
+ ctx->mode = 0755;
+ ctx->hstate = &default_hstate;
+ ctx->min_hpages = -1; /* No default minimum size */
+ ctx->max_val_type = NO_SIZE;
+ ctx->min_val_type = NO_SIZE;
+ fc->fs_private = ctx;
+ fc->ops = &hugetlbfs_fs_context_ops;
+ return 0;
}
static struct file_system_type hugetlbfs_fs_type = {
- .name = "hugetlbfs",
- .mount = hugetlbfs_mount,
- .kill_sb = kill_litter_super,
+ .name = "hugetlbfs",
+ .init_fs_context = hugetlbfs_init_fs_context,
+ .parameters = &hugetlb_fs_parameters,
+ .kill_sb = kill_litter_super,
};
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
@@ -1384,8 +1421,29 @@ out:
return file;
}
+static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
+{
+ struct fs_context *fc;
+ struct vfsmount *mnt;
+
+ fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT);
+ if (IS_ERR(fc)) {
+ mnt = ERR_CAST(fc);
+ } else {
+ struct hugetlbfs_fs_context *ctx = fc->fs_private;
+ ctx->hstate = h;
+ mnt = fc_mount(fc);
+ put_fs_context(fc);
+ }
+ if (IS_ERR(mnt))
+ pr_err("Cannot mount internal hugetlbfs for page size %uK",
+ 1U << (h->order + PAGE_SHIFT - 10));
+ return mnt;
+}
+
static int __init init_hugetlbfs_fs(void)
{
+ struct vfsmount *mnt;
struct hstate *h;
int error;
int i;
@@ -1408,24 +1466,16 @@ static int __init init_hugetlbfs_fs(void)
i = 0;
for_each_hstate(h) {
- char buf[50];
- unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
-
- snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
- hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
- buf);
-
- if (IS_ERR(hugetlbfs_vfsmount[i])) {
- pr_err("Cannot mount internal hugetlbfs for "
- "page size %uK", ps_kb);
- error = PTR_ERR(hugetlbfs_vfsmount[i]);
- hugetlbfs_vfsmount[i] = NULL;
+ mnt = mount_one_hugetlbfs(h);
+ if (IS_ERR(mnt) && i == 0) {
+ error = PTR_ERR(mnt);
+ goto out;
}
+ hugetlbfs_vfsmount[i] = mnt;
i++;
}
- /* Non default hstates are optional */
- if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
- return 0;
+
+ return 0;
out:
kmem_cache_destroy(hugetlbfs_inode_cachep);
diff --git a/fs/inode.c b/fs/inode.c
index e9d97add2b36..9a453f3637f8 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1817,8 +1817,13 @@ int file_remove_privs(struct file *file)
int kill;
int error = 0;
- /* Fast path for nothing security related */
- if (IS_NOSEC(inode))
+ /*
+ * Fast path for nothing security related.
+ * As well for non-regular files, e.g. blkdev inodes.
+ * For example, blkdev_write_iter() might get here
+ * trying to remove privs which it is not allowed to.
+ */
+ if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
return 0;
kill = dentry_needs_remove_privs(dentry);
diff --git a/fs/internal.h b/fs/internal.h
index d410186bc369..6a8b71643af4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -17,6 +17,7 @@ struct linux_binprm;
struct path;
struct mount;
struct shrink_control;
+struct fs_context;
/*
* block_dev.c
@@ -52,8 +53,16 @@ int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
extern void __init chrdev_init(void);
/*
+ * fs_context.c
+ */
+extern int parse_monolithic_mount_data(struct fs_context *, void *);
+extern void fc_drop_locked(struct fs_context *);
+
+/*
* namei.c
*/
+extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
+ struct path *path, struct path *root);
extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
const char *, unsigned int, struct path *);
@@ -99,10 +108,8 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
/*
* super.c
*/
-extern int do_remount_sb(struct super_block *, int, void *, int);
+extern int reconfigure_super(struct fs_context *);
extern bool trylock_super(struct super_block *sb);
-extern struct dentry *mount_fs(struct file_system_type *,
- int, const char *, void *);
extern struct super_block *user_get_super(dev_t);
/*
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5d99376d2369..84efb8956734 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4,15 +4,28 @@
* supporting fast/efficient IO.
*
* A note on the read/write ordering memory barriers that are matched between
- * the application and kernel side. When the application reads the CQ ring
- * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
- * the kernel uses after writing the tail. Failure to do so could cause a
- * delay in when the application notices that completion events available.
- * This isn't a fatal condition. Likewise, the application must use an
- * appropriate smp_wmb() both before writing the SQ tail, and after writing
- * the SQ tail. The first one orders the sqe writes with the tail write, and
- * the latter is paired with the smp_rmb() the kernel will issue before
- * reading the SQ tail on submission.
+ * the application and kernel side.
+ *
+ * After the application reads the CQ ring tail, it must use an
+ * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
+ * before writing the tail (using smp_load_acquire to read the tail will
+ * do). It also needs a smp_mb() before updating CQ head (ordering the
+ * entry load(s) with the head store), pairing with an implicit barrier
+ * through a control-dependency in io_get_cqring (smp_store_release to
+ * store head will do). Failure to do so could lead to reading invalid
+ * CQ entries.
+ *
+ * Likewise, the application must use an appropriate smp_wmb() before
+ * writing the SQ tail (ordering SQ entry stores with the tail store),
+ * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
+ * to store the tail will do). And it needs a barrier ordering the SQ
+ * head load before writing new SQ entries (smp_load_acquire to read
+ * head will do).
+ *
+ * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
+ * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
+ * updating the SQ tail; a full memory barrier smp_mb() is needed
+ * between.
*
* Also see the examples in the liburing library:
*
@@ -70,20 +83,108 @@ struct io_uring {
u32 tail ____cacheline_aligned_in_smp;
};
+/*
+ * This data is shared with the application through the mmap at offset
+ * IORING_OFF_SQ_RING.
+ *
+ * The offsets to the member fields are published through struct
+ * io_sqring_offsets when calling io_uring_setup.
+ */
struct io_sq_ring {
+ /*
+ * Head and tail offsets into the ring; the offsets need to be
+ * masked to get valid indices.
+ *
+ * The kernel controls head and the application controls tail.
+ */
struct io_uring r;
+ /*
+ * Bitmask to apply to head and tail offsets (constant, equals
+ * ring_entries - 1)
+ */
u32 ring_mask;
+ /* Ring size (constant, power of 2) */
u32 ring_entries;
+ /*
+ * Number of invalid entries dropped by the kernel due to
+ * invalid index stored in array
+ *
+ * Written by the kernel, shouldn't be modified by the
+ * application (i.e. get number of "new events" by comparing to
+ * cached value).
+ *
+ * After a new SQ head value was read by the application this
+ * counter includes all submissions that were dropped reaching
+ * the new SQ head (and possibly more).
+ */
u32 dropped;
+ /*
+ * Runtime flags
+ *
+ * Written by the kernel, shouldn't be modified by the
+ * application.
+ *
+ * The application needs a full memory barrier before checking
+ * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
+ */
u32 flags;
+ /*
+ * Ring buffer of indices into array of io_uring_sqe, which is
+ * mmapped by the application using the IORING_OFF_SQES offset.
+ *
+ * This indirection could e.g. be used to assign fixed
+ * io_uring_sqe entries to operations and only submit them to
+ * the queue when needed.
+ *
+ * The kernel modifies neither the indices array nor the entries
+ * array.
+ */
u32 array[];
};
+/*
+ * This data is shared with the application through the mmap at offset
+ * IORING_OFF_CQ_RING.
+ *
+ * The offsets to the member fields are published through struct
+ * io_cqring_offsets when calling io_uring_setup.
+ */
struct io_cq_ring {
+ /*
+ * Head and tail offsets into the ring; the offsets need to be
+ * masked to get valid indices.
+ *
+ * The application controls head and the kernel tail.
+ */
struct io_uring r;
+ /*
+ * Bitmask to apply to head and tail offsets (constant, equals
+ * ring_entries - 1)
+ */
u32 ring_mask;
+ /* Ring size (constant, power of 2) */
u32 ring_entries;
+ /*
+ * Number of completion events lost because the queue was full;
+ * this should be avoided by the application by making sure
+ * there are not more requests pending thatn there is space in
+ * the completion queue.
+ *
+ * Written by the kernel, shouldn't be modified by the
+ * application (i.e. get number of "new events" by comparing to
+ * cached value).
+ *
+ * As completion events come in out of order this counter is not
+ * ordered with any other data.
+ */
u32 overflow;
+ /*
+ * Ring buffer of completion events.
+ *
+ * The kernel writes completion events fresh every time they are
+ * produced, so the application is allowed to modify pending
+ * entries.
+ */
struct io_uring_cqe cqes[];
};
@@ -189,17 +290,28 @@ struct sqe_submit {
bool needs_fixed_file;
};
+/*
+ * First field must be the file pointer in all the
+ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
+ */
struct io_poll_iocb {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
- bool woken;
+ bool done;
bool canceled;
struct wait_queue_entry wait;
};
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
struct io_kiocb {
union {
+ struct file *file;
struct kiocb rw;
struct io_poll_iocb poll;
};
@@ -210,10 +322,11 @@ struct io_kiocb {
struct list_head list;
unsigned int flags;
refcount_t refs;
-#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */
+#define REQ_F_NOWAIT 1 /* must not punt to workers */
#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
#define REQ_F_FIXED_FILE 4 /* ctx owns file */
#define REQ_F_SEQ_PREV 8 /* sequential with previous */
+#define REQ_F_PREPPED 16 /* prep already done */
u64 user_data;
u64 error;
@@ -305,12 +418,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
/* order cqe stores with ring update */
smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
- /*
- * Write sider barrier of tail update, app has read side. See
- * comment at the top of this file.
- */
- smp_wmb();
-
if (wq_has_sleeper(&ctx->cq_wait)) {
wake_up_interruptible(&ctx->cq_wait);
kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
@@ -324,9 +431,12 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
unsigned tail;
tail = ctx->cached_cq_tail;
- /* See comment at the top of the file */
- smp_rmb();
- if (tail + 1 == READ_ONCE(ring->r.head))
+ /*
+ * writes to the cq entry need to come after reading head; the
+ * control dependency is enough as we're using WRITE_ONCE to
+ * fill the cq entry
+ */
+ if (tail - READ_ONCE(ring->r.head) == ring->ring_entries)
return NULL;
ctx->cached_cq_tail++;
@@ -355,20 +465,25 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
}
}
-static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->sqo_wait))
+ wake_up(&ctx->sqo_wait);
+}
+
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
long res, unsigned ev_flags)
{
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
+ io_cqring_fill_event(ctx, user_data, res, ev_flags);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
- if (waitqueue_active(&ctx->sqo_wait))
- wake_up(&ctx->sqo_wait);
+ io_cqring_ev_posted(ctx);
}
static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
@@ -382,13 +497,14 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
+ gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;
if (!percpu_ref_tryget(&ctx->refs))
return NULL;
if (!state) {
- req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
+ req = kmem_cache_alloc(req_cachep, gfp);
if (unlikely(!req))
goto out;
} else if (!state->free_reqs) {
@@ -396,10 +512,18 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
int ret;
sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
- ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
- state->reqs);
- if (unlikely(ret <= 0))
- goto out;
+ ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
+
+ /*
+ * Bulk alloc is all-or-nothing. If we fail to get a batch,
+ * retry single alloc to be on the safe side.
+ */
+ if (unlikely(ret <= 0)) {
+ state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+ if (!state->reqs[0])
+ goto out;
+ ret = 1;
+ }
state->free_reqs = ret - 1;
state->cur_req = 1;
req = state->reqs[0];
@@ -411,7 +535,8 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
req->ctx = ctx;
req->flags = 0;
- refcount_set(&req->refs, 0);
+ /* one is dropped after submission, the other at completion */
+ refcount_set(&req->refs, 2);
return req;
out:
io_ring_drop_ctx_refs(ctx, 1);
@@ -429,10 +554,16 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
static void io_free_req(struct io_kiocb *req)
{
- if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) {
- io_ring_drop_ctx_refs(req->ctx, 1);
- kmem_cache_free(req_cachep, req);
- }
+ if (req->file && !(req->flags & REQ_F_FIXED_FILE))
+ fput(req->file);
+ io_ring_drop_ctx_refs(req->ctx, 1);
+ kmem_cache_free(req_cachep, req);
+}
+
+static void io_put_req(struct io_kiocb *req)
+{
+ if (refcount_dec_and_test(&req->refs))
+ io_free_req(req);
}
/*
@@ -442,44 +573,34 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
struct list_head *done)
{
void *reqs[IO_IOPOLL_BATCH];
- int file_count, to_free;
- struct file *file = NULL;
struct io_kiocb *req;
+ int to_free;
- file_count = to_free = 0;
+ to_free = 0;
while (!list_empty(done)) {
req = list_first_entry(done, struct io_kiocb, list);
list_del(&req->list);
io_cqring_fill_event(ctx, req->user_data, req->error, 0);
-
- reqs[to_free++] = req;
(*nr_events)++;
- /*
- * Batched puts of the same file, to avoid dirtying the
- * file usage count multiple times, if avoidable.
- */
- if (!(req->flags & REQ_F_FIXED_FILE)) {
- if (!file) {
- file = req->rw.ki_filp;
- file_count = 1;
- } else if (file == req->rw.ki_filp) {
- file_count++;
+ if (refcount_dec_and_test(&req->refs)) {
+ /* If we're not using fixed files, we have to pair the
+ * completion part with the file put. Use regular
+ * completions for those, only batch free for fixed
+ * file.
+ */
+ if (req->flags & REQ_F_FIXED_FILE) {
+ reqs[to_free++] = req;
+ if (to_free == ARRAY_SIZE(reqs))
+ io_free_req_many(ctx, reqs, &to_free);
} else {
- fput_many(file, file_count);
- file = req->rw.ki_filp;
- file_count = 1;
+ io_free_req(req);
}
}
-
- if (to_free == ARRAY_SIZE(reqs))
- io_free_req_many(ctx, reqs, &to_free);
}
- io_commit_cqring(ctx);
- if (file)
- fput_many(file, file_count);
+ io_commit_cqring(ctx);
io_free_req_many(ctx, reqs, &to_free);
}
@@ -602,21 +723,14 @@ static void kiocb_end_write(struct kiocb *kiocb)
}
}
-static void io_fput(struct io_kiocb *req)
-{
- if (!(req->flags & REQ_F_FIXED_FILE))
- fput(req->rw.ki_filp);
-}
-
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
kiocb_end_write(kiocb);
- io_fput(req);
io_cqring_add_event(req->ctx, req->user_data, res, 0);
- io_free_req(req);
+ io_put_req(req);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -666,11 +780,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
list_add_tail(&req->list, &ctx->poll_list);
}
-static void io_file_put(struct io_submit_state *state, struct file *file)
+static void io_file_put(struct io_submit_state *state)
{
- if (!state) {
- fput(file);
- } else if (state->file) {
+ if (state->file) {
int diff = state->has_refs - state->used_refs;
if (diff)
@@ -695,7 +807,7 @@ static struct file *io_file_get(struct io_submit_state *state, int fd)
state->ios_left--;
return state->file;
}
- io_file_put(state, NULL);
+ io_file_put(state);
}
state->file = fget_many(fd, state->ios_left);
if (!state->file)
@@ -726,36 +838,23 @@ static bool io_file_supports_async(struct file *file)
}
static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
- bool force_nonblock, struct io_submit_state *state)
+ bool force_nonblock)
{
const struct io_uring_sqe *sqe = s->sqe;
struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw;
- unsigned ioprio, flags;
- int fd, ret;
+ unsigned ioprio;
+ int ret;
+ if (!req->file)
+ return -EBADF;
/* For -EAGAIN retry, everything is already prepped */
- if (kiocb->ki_filp)
+ if (req->flags & REQ_F_PREPPED)
return 0;
- flags = READ_ONCE(sqe->flags);
- fd = READ_ONCE(sqe->fd);
+ if (force_nonblock && !io_file_supports_async(req->file))
+ force_nonblock = false;
- if (flags & IOSQE_FIXED_FILE) {
- if (unlikely(!ctx->user_files ||
- (unsigned) fd >= ctx->nr_user_files))
- return -EBADF;
- kiocb->ki_filp = ctx->user_files[fd];
- req->flags |= REQ_F_FIXED_FILE;
- } else {
- if (s->needs_fixed_file)
- return -EBADF;
- kiocb->ki_filp = io_file_get(state, fd);
- if (unlikely(!kiocb->ki_filp))
- return -EBADF;
- if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
- force_nonblock = false;
- }
kiocb->ki_pos = READ_ONCE(sqe->off);
kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -764,7 +863,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
if (ioprio) {
ret = ioprio_check_cap(ioprio);
if (ret)
- goto out_fput;
+ return ret;
kiocb->ki_ioprio = ioprio;
} else
@@ -772,38 +871,30 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
if (unlikely(ret))
- goto out_fput;
- if (force_nonblock) {
+ return ret;
+
+ /* don't allow async punt if RWF_NOWAIT was requested */
+ if (kiocb->ki_flags & IOCB_NOWAIT)
+ req->flags |= REQ_F_NOWAIT;
+
+ if (force_nonblock)
kiocb->ki_flags |= IOCB_NOWAIT;
- req->flags |= REQ_F_FORCE_NONBLOCK;
- }
+
if (ctx->flags & IORING_SETUP_IOPOLL) {
- ret = -EOPNOTSUPP;
if (!(kiocb->ki_flags & IOCB_DIRECT) ||
!kiocb->ki_filp->f_op->iopoll)
- goto out_fput;
+ return -EOPNOTSUPP;
req->error = 0;
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
} else {
- if (kiocb->ki_flags & IOCB_HIPRI) {
- ret = -EINVAL;
- goto out_fput;
- }
+ if (kiocb->ki_flags & IOCB_HIPRI)
+ return -EINVAL;
kiocb->ki_complete = io_complete_rw;
}
+ req->flags |= REQ_F_PREPPED;
return 0;
-out_fput:
- if (!(flags & IOSQE_FIXED_FILE)) {
- /*
- * in case of error, we didn't use this file reference. drop it.
- */
- if (state)
- state->used_refs--;
- io_file_put(state, kiocb->ki_filp);
- }
- return ret;
}
static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
@@ -864,6 +955,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
if (offset)
iov_iter_advance(iter, offset);
+
+ /* don't drop a reference to these pages */
+ iter->type |= ITER_BVEC_FLAG_NO_REF;
return 0;
}
@@ -887,7 +981,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
opcode = READ_ONCE(sqe->opcode);
if (opcode == IORING_OP_READ_FIXED ||
opcode == IORING_OP_WRITE_FIXED) {
- ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+ int ret = io_import_fixed(ctx, rw, sqe, iter);
*iovec = NULL;
return ret;
}
@@ -923,7 +1017,7 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
/* Use 8x RA size as a decent limiter for both reads/writes */
max_pages = filp->f_ra.ra_pages;
if (!max_pages)
- max_pages = VM_MAX_READAHEAD >> (PAGE_SHIFT - 10);
+ max_pages = VM_READAHEAD_PAGES;
max_pages *= 8;
/* If max pages are exceeded, reset the state */
@@ -945,31 +1039,29 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
async_list->io_end = io_end;
}
-static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
- bool force_nonblock, struct io_submit_state *state)
+static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
+ bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw;
struct iov_iter iter;
struct file *file;
size_t iov_count;
- ssize_t ret;
+ int ret;
- ret = io_prep_rw(req, s, force_nonblock, state);
+ ret = io_prep_rw(req, s, force_nonblock);
if (ret)
return ret;
file = kiocb->ki_filp;
- ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_READ)))
- goto out_fput;
- ret = -EINVAL;
+ return -EBADF;
if (unlikely(!file->f_op->read_iter))
- goto out_fput;
+ return -EINVAL;
ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
if (ret)
- goto out_fput;
+ return ret;
iov_count = iov_iter_count(&iter);
ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
@@ -991,38 +1083,32 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
}
}
kfree(iovec);
-out_fput:
- /* Hold on to the file for -EAGAIN */
- if (unlikely(ret && ret != -EAGAIN))
- io_fput(req);
return ret;
}
-static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
- bool force_nonblock, struct io_submit_state *state)
+static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
+ bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw;
struct iov_iter iter;
struct file *file;
size_t iov_count;
- ssize_t ret;
+ int ret;
- ret = io_prep_rw(req, s, force_nonblock, state);
+ ret = io_prep_rw(req, s, force_nonblock);
if (ret)
return ret;
- ret = -EBADF;
file = kiocb->ki_filp;
if (unlikely(!(file->f_mode & FMODE_WRITE)))
- goto out_fput;
- ret = -EINVAL;
+ return -EBADF;
if (unlikely(!file->f_op->write_iter))
- goto out_fput;
+ return -EINVAL;
ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
if (ret)
- goto out_fput;
+ return ret;
iov_count = iov_iter_count(&iter);
@@ -1036,6 +1122,8 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
if (!ret) {
+ ssize_t ret2;
+
/*
* Open-code file_start_write here to grab freeze protection,
* which will be released by another thread in
@@ -1050,14 +1138,22 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
SB_FREEZE_WRITE);
}
kiocb->ki_flags |= IOCB_WRITE;
- io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
+
+ ret2 = call_write_iter(file, kiocb, &iter);
+ if (!force_nonblock || ret2 != -EAGAIN) {
+ io_rw_done(kiocb, ret2);
+ } else {
+ /*
+ * If ->needs_lock is true, we're already in async
+ * context.
+ */
+ if (!s->needs_lock)
+ io_async_list_note(WRITE, req, iov_count);
+ ret = -EAGAIN;
+ }
}
out_free:
kfree(iovec);
-out_fput:
- /* Hold on to the file for -EAGAIN */
- if (unlikely(ret && ret != -EAGAIN))
- io_fput(req);
return ret;
}
@@ -1072,29 +1168,19 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- /*
- * Twilight zone - it's possible that someone issued an opcode that
- * has a file attached, then got -EAGAIN on submission, and changed
- * the sqe before we retried it from async context. Avoid dropping
- * a file reference for this malicious case, and flag the error.
- */
- if (req->rw.ki_filp) {
- err = -EBADF;
- io_fput(req);
- }
io_cqring_add_event(ctx, user_data, err, 0);
- io_free_req(req);
+ io_put_req(req);
return 0;
}
static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned flags;
- int fd;
- /* Prep already done */
- if (req->rw.ki_filp)
+ if (!req->file)
+ return -EBADF;
+ /* Prep already done (EAGAIN retry) */
+ if (req->flags & REQ_F_PREPPED)
return 0;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
@@ -1102,20 +1188,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
return -EINVAL;
- fd = READ_ONCE(sqe->fd);
- flags = READ_ONCE(sqe->flags);
-
- if (flags & IOSQE_FIXED_FILE) {
- if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
- return -EBADF;
- req->rw.ki_filp = ctx->user_files[fd];
- req->flags |= REQ_F_FIXED_FILE;
- } else {
- req->rw.ki_filp = fget(fd);
- if (unlikely(!req->rw.ki_filp))
- return -EBADF;
- }
-
+ req->flags |= REQ_F_PREPPED;
return 0;
}
@@ -1144,9 +1217,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
end > 0 ? end : LLONG_MAX,
fsync_flags & IORING_FSYNC_DATASYNC);
- io_fput(req);
io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
- io_free_req(req);
+ io_put_req(req);
return 0;
}
@@ -1204,15 +1276,16 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
spin_unlock_irq(&ctx->completion_lock);
io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
- io_free_req(req);
+ io_put_req(req);
return 0;
}
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ __poll_t mask)
{
- io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
- io_fput(req);
- io_free_req(req);
+ req->poll.done = true;
+ io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0);
+ io_commit_cqring(ctx);
}
static void io_poll_complete_work(struct work_struct *work)
@@ -1240,9 +1313,11 @@ static void io_poll_complete_work(struct work_struct *work)
return;
}
list_del_init(&req->list);
+ io_poll_complete(ctx, req, mask);
spin_unlock_irq(&ctx->completion_lock);
- io_poll_complete(req, mask);
+ io_cqring_ev_posted(ctx);
+ io_put_req(req);
}
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -1253,29 +1328,25 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
struct io_ring_ctx *ctx = req->ctx;
__poll_t mask = key_to_poll(key);
-
- poll->woken = true;
+ unsigned long flags;
/* for instances that support it check for an event match first: */
- if (mask) {
- unsigned long flags;
+ if (mask && !(mask & poll->events))
+ return 0;
- if (!(mask & poll->events))
- return 0;
+ list_del_init(&poll->wait.entry);
- /* try to complete the iocb inline if we can: */
- if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
- list_del(&req->list);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+ list_del(&req->list);
+ io_poll_complete(ctx, req, mask);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
- list_del_init(&poll->wait.entry);
- io_poll_complete(req, mask);
- return 1;
- }
+ io_cqring_ev_posted(ctx);
+ io_put_req(req);
+ } else {
+ queue_work(ctx->sqo_wq, &req->work);
}
- list_del_init(&poll->wait.entry);
- queue_work(ctx->sqo_wq, &req->work);
return 1;
}
@@ -1305,36 +1376,23 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_poll_iocb *poll = &req->poll;
struct io_ring_ctx *ctx = req->ctx;
struct io_poll_table ipt;
- unsigned flags;
+ bool cancel = false;
__poll_t mask;
u16 events;
- int fd;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
return -EINVAL;
+ if (!poll->file)
+ return -EBADF;
INIT_WORK(&req->work, io_poll_complete_work);
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
- flags = READ_ONCE(sqe->flags);
- fd = READ_ONCE(sqe->fd);
-
- if (flags & IOSQE_FIXED_FILE) {
- if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
- return -EBADF;
- poll->file = ctx->user_files[fd];
- req->flags |= REQ_F_FIXED_FILE;
- } else {
- poll->file = fget(fd);
- }
- if (unlikely(!poll->file))
- return -EBADF;
-
poll->head = NULL;
- poll->woken = false;
+ poll->done = false;
poll->canceled = false;
ipt.pt._qproc = io_poll_queue_proc;
@@ -1346,56 +1404,43 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
INIT_LIST_HEAD(&poll->wait.entry);
init_waitqueue_func_entry(&poll->wait, io_poll_wake);
- /* one for removal from waitqueue, one for this function */
- refcount_set(&req->refs, 2);
-
mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
- if (unlikely(!poll->head)) {
- /* we did not manage to set up a waitqueue, done */
- goto out;
- }
spin_lock_irq(&ctx->completion_lock);
- spin_lock(&poll->head->lock);
- if (poll->woken) {
- /* wake_up context handles the rest */
- mask = 0;
+ if (likely(poll->head)) {
+ spin_lock(&poll->head->lock);
+ if (unlikely(list_empty(&poll->wait.entry))) {
+ if (ipt.error)
+ cancel = true;
+ ipt.error = 0;
+ mask = 0;
+ }
+ if (mask || ipt.error)
+ list_del_init(&poll->wait.entry);
+ else if (cancel)
+ WRITE_ONCE(poll->canceled, true);
+ else if (!poll->done) /* actually waiting for an event */
+ list_add_tail(&req->list, &ctx->cancel_list);
+ spin_unlock(&poll->head->lock);
+ }
+ if (mask) { /* no async, we'd stolen it */
+ req->error = mangle_poll(mask);
ipt.error = 0;
- } else if (mask || ipt.error) {
- /* if we get an error or a mask we are done */
- WARN_ON_ONCE(list_empty(&poll->wait.entry));
- list_del_init(&poll->wait.entry);
- } else {
- /* actually waiting for an event */
- list_add_tail(&req->list, &ctx->cancel_list);
+ io_poll_complete(ctx, req, mask);
}
- spin_unlock(&poll->head->lock);
spin_unlock_irq(&ctx->completion_lock);
-out:
- if (unlikely(ipt.error)) {
- if (!(flags & IOSQE_FIXED_FILE))
- fput(poll->file);
- /*
- * Drop one of our refs to this req, __io_submit_sqe() will
- * drop the other one since we're returning an error.
- */
- io_free_req(req);
- return ipt.error;
+ if (mask) {
+ io_cqring_ev_posted(ctx);
+ io_put_req(req);
}
-
- if (mask)
- io_poll_complete(req, mask);
- io_free_req(req);
- return 0;
+ return ipt.error;
}
static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct sqe_submit *s, bool force_nonblock,
- struct io_submit_state *state)
+ const struct sqe_submit *s, bool force_nonblock)
{
- ssize_t ret;
- int opcode;
+ int ret, opcode;
if (unlikely(s->index >= ctx->sq_entries))
return -EINVAL;
@@ -1409,18 +1454,18 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
case IORING_OP_READV:
if (unlikely(s->sqe->buf_index))
return -EINVAL;
- ret = io_read(req, s, force_nonblock, state);
+ ret = io_read(req, s, force_nonblock);
break;
case IORING_OP_WRITEV:
if (unlikely(s->sqe->buf_index))
return -EINVAL;
- ret = io_write(req, s, force_nonblock, state);
+ ret = io_write(req, s, force_nonblock);
break;
case IORING_OP_READ_FIXED:
- ret = io_read(req, s, force_nonblock, state);
+ ret = io_read(req, s, force_nonblock);
break;
case IORING_OP_WRITE_FIXED:
- ret = io_write(req, s, force_nonblock, state);
+ ret = io_write(req, s, force_nonblock);
break;
case IORING_OP_FSYNC:
ret = io_fsync(req, s->sqe, force_nonblock);
@@ -1493,8 +1538,7 @@ restart:
struct sqe_submit *s = &req->submit;
const struct io_uring_sqe *sqe = s->sqe;
- /* Ensure we clear previously set forced non-block flag */
- req->flags &= ~REQ_F_FORCE_NONBLOCK;
+ /* Ensure we clear previously set non-block flag */
req->rw.ki_flags &= ~IOCB_NOWAIT;
ret = 0;
@@ -1513,7 +1557,7 @@ restart:
s->has_user = cur_mm != NULL;
s->needs_lock = true;
do {
- ret = __io_submit_sqe(ctx, req, s, false, NULL);
+ ret = __io_submit_sqe(ctx, req, s, false);
/*
* We can get EAGAIN for polled IO even though
* we're forcing a sync submission from here,
@@ -1525,9 +1569,13 @@ restart:
cond_resched();
} while (1);
}
+
+ /* drop submission reference */
+ io_put_req(req);
+
if (ret) {
io_cqring_add_event(ctx, sqe->user_data, ret, 0);
- io_free_req(req);
+ io_put_req(req);
}
/* async context always use a copy of the sqe */
@@ -1614,11 +1662,55 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
return ret;
}
+static bool io_op_needs_file(const struct io_uring_sqe *sqe)
+{
+ int op = READ_ONCE(sqe->opcode);
+
+ switch (op) {
+ case IORING_OP_NOP:
+ case IORING_OP_POLL_REMOVE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
+ struct io_submit_state *state, struct io_kiocb *req)
+{
+ unsigned flags;
+ int fd;
+
+ flags = READ_ONCE(s->sqe->flags);
+ fd = READ_ONCE(s->sqe->fd);
+
+ if (!io_op_needs_file(s->sqe)) {
+ req->file = NULL;
+ return 0;
+ }
+
+ if (flags & IOSQE_FIXED_FILE) {
+ if (unlikely(!ctx->user_files ||
+ (unsigned) fd >= ctx->nr_user_files))
+ return -EBADF;
+ req->file = ctx->user_files[fd];
+ req->flags |= REQ_F_FIXED_FILE;
+ } else {
+ if (s->needs_fixed_file)
+ return -EBADF;
+ req->file = io_file_get(state, fd);
+ if (unlikely(!req->file))
+ return -EBADF;
+ }
+
+ return 0;
+}
+
static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
struct io_submit_state *state)
{
struct io_kiocb *req;
- ssize_t ret;
+ int ret;
/* enforce forwards compatibility on users */
if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
@@ -1628,10 +1720,12 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
if (unlikely(!req))
return -EAGAIN;
- req->rw.ki_filp = NULL;
+ ret = io_req_set_file(ctx, s, state, req);
+ if (unlikely(ret))
+ goto out;
- ret = __io_submit_sqe(ctx, req, s, true, state);
- if (ret == -EAGAIN) {
+ ret = __io_submit_sqe(ctx, req, s, true);
+ if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
struct io_uring_sqe *sqe_copy;
sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
@@ -1649,11 +1743,23 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
INIT_WORK(&req->work, io_sq_wq_submit_work);
queue_work(ctx->sqo_wq, &req->work);
}
- ret = 0;
+
+ /*
+ * Queued up for async execution, worker will release
+ * submit reference when the iocb is actually
+ * submitted.
+ */
+ return 0;
}
}
+
+out:
+ /* drop submission reference */
+ io_put_req(req);
+
+ /* and drop final reference, if we failed */
if (ret)
- io_free_req(req);
+ io_put_req(req);
return ret;
}
@@ -1664,7 +1770,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
static void io_submit_state_end(struct io_submit_state *state)
{
blk_finish_plug(&state->plug);
- io_file_put(state, NULL);
+ io_file_put(state);
if (state->free_reqs)
kmem_cache_free_bulk(req_cachep, state->free_reqs,
&state->reqs[state->cur_req]);
@@ -1693,24 +1799,10 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
* write new data to them.
*/
smp_store_release(&ring->r.head, ctx->cached_sq_head);
-
- /*
- * write side barrier of head update, app has read side. See
- * comment at the top of this file
- */
- smp_wmb();
}
}
/*
- * Undo last io_get_sqring()
- */
-static void io_drop_sqring(struct io_ring_ctx *ctx)
-{
- ctx->cached_sq_head--;
-}
-
-/*
* Fetch an sqe, if one is available. Note that s->sqe will point to memory
* that is mapped by userspace. This means that care needs to be taken to
* ensure that reads are stable, as we cannot rely on userspace always
@@ -1732,9 +1824,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
* though the application is the one updating it.
*/
head = ctx->cached_sq_head;
- /* See comment at the top of this file */
- smp_rmb();
- if (head == READ_ONCE(ring->r.tail))
+ /* make sure SQ entry isn't read before tail */
+ if (head == smp_load_acquire(&ring->r.tail))
return false;
head = READ_ONCE(ring->array[head & ctx->sq_mask]);
@@ -1748,8 +1839,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
/* drop invalid entries */
ctx->cached_sq_head++;
ring->dropped++;
- /* See comment at the top of this file */
- smp_wmb();
return false;
}
@@ -1859,7 +1948,8 @@ static int io_sq_thread(void *data)
/* Tell userspace we may need a wakeup call */
ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
- smp_wmb();
+ /* make sure to read SQ tail after writing flags */
+ smp_mb();
if (!io_get_sqring(ctx, &sqes[0])) {
if (kthread_should_stop()) {
@@ -1872,13 +1962,11 @@ static int io_sq_thread(void *data)
finish_wait(&ctx->sqo_wait, &wait);
ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
- smp_wmb();
continue;
}
finish_wait(&ctx->sqo_wait, &wait);
ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
- smp_wmb();
}
i = 0;
@@ -1913,13 +2001,17 @@ static int io_sq_thread(void *data)
unuse_mm(cur_mm);
mmput(cur_mm);
}
+
+ if (kthread_should_park())
+ kthread_parkme();
+
return 0;
}
static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
{
struct io_submit_state state, *statep = NULL;
- int i, ret = 0, submit = 0;
+ int i, submit = 0;
if (to_submit > IO_PLUG_THRESHOLD) {
io_submit_state_start(&state, ctx, to_submit);
@@ -1928,6 +2020,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
for (i = 0; i < to_submit; i++) {
struct sqe_submit s;
+ int ret;
if (!io_get_sqring(ctx, &s))
break;
@@ -1935,21 +2028,18 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
s.has_user = true;
s.needs_lock = false;
s.needs_fixed_file = false;
+ submit++;
ret = io_submit_sqe(ctx, &s, statep);
- if (ret) {
- io_drop_sqring(ctx);
- break;
- }
-
- submit++;
+ if (ret)
+ io_cqring_add_event(ctx, s.sqe->user_data, ret, 0);
}
io_commit_sqring(ctx);
if (statep)
io_submit_state_end(statep);
- return submit ? submit : ret;
+ return submit;
}
static unsigned io_cqring_events(struct io_cq_ring *ring)
@@ -1975,7 +2065,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return 0;
if (sig) {
- ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
+#ifdef CONFIG_COMPAT
+ if (in_compat_syscall())
+ ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
+ &ksigmask, &sigsaved, sigsz);
+ else
+#endif
+ ret = set_user_sigmask(sig, &ksigmask,
+ &sigsaved, sigsz);
+
if (ret)
return ret;
}
@@ -2039,6 +2137,7 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx)
if (ctx->sqo_thread) {
ctx->sqo_stop = 1;
mb();
+ kthread_park(ctx->sqo_thread);
kthread_stop(ctx->sqo_thread);
ctx->sqo_thread = NULL;
}
@@ -2200,6 +2299,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
fput(ctx->user_files[i]);
kfree(ctx->user_files);
+ ctx->user_files = NULL;
ctx->nr_user_files = 0;
return ret;
}
@@ -2220,19 +2320,23 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
mmgrab(current->mm);
ctx->sqo_mm = current->mm;
- ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
- if (!ctx->sq_thread_idle)
- ctx->sq_thread_idle = HZ;
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ ret = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto err;
- ret = -EINVAL;
- if (!cpu_possible(p->sq_thread_cpu))
- goto err;
+ ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
+ if (!ctx->sq_thread_idle)
+ ctx->sq_thread_idle = HZ;
- if (ctx->flags & IORING_SETUP_SQPOLL) {
if (p->flags & IORING_SETUP_SQ_AFF) {
- int cpu;
+ int cpu = array_index_nospec(p->sq_thread_cpu,
+ nr_cpu_ids);
+
+ ret = -EINVAL;
+ if (!cpu_possible(cpu))
+ goto err;
- cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
ctx, cpu,
"io_uring-sq");
@@ -2293,8 +2397,12 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
static void io_mem_free(void *ptr)
{
- struct page *page = virt_to_head_page(ptr);
+ struct page *page;
+
+ if (!ptr)
+ return;
+ page = virt_to_head_page(ptr);
if (put_page_testzero(page))
free_compound_page(page);
}
@@ -2335,7 +2443,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
if (ctx->account_mem)
io_unaccount_mem(ctx->user, imu->nr_bvecs);
- kfree(imu->bvec);
+ kvfree(imu->bvec);
imu->nr_bvecs = 0;
}
@@ -2427,9 +2535,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
if (!pages || nr_pages > got_pages) {
kfree(vmas);
kfree(pages);
- pages = kmalloc_array(nr_pages, sizeof(struct page *),
+ pages = kvmalloc_array(nr_pages, sizeof(struct page *),
GFP_KERNEL);
- vmas = kmalloc_array(nr_pages,
+ vmas = kvmalloc_array(nr_pages,
sizeof(struct vm_area_struct *),
GFP_KERNEL);
if (!pages || !vmas) {
@@ -2441,7 +2549,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
got_pages = nr_pages;
}
- imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
+ imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
GFP_KERNEL);
ret = -ENOMEM;
if (!imu->bvec) {
@@ -2480,6 +2588,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
}
if (ctx->account_mem)
io_unaccount_mem(ctx->user, nr_pages);
+ kvfree(imu->bvec);
goto err;
}
@@ -2502,12 +2611,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
ctx->nr_user_bufs++;
}
- kfree(pages);
- kfree(vmas);
+ kvfree(pages);
+ kvfree(vmas);
return 0;
err:
- kfree(pages);
- kfree(vmas);
+ kvfree(pages);
+ kvfree(vmas);
io_sqe_buffer_unregister(ctx);
return ret;
}
@@ -2545,9 +2654,13 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
__poll_t mask = 0;
poll_wait(file, &ctx->cq_wait, wait);
- /* See comment at the top of this file */
+ /*
+ * synchronizes with barrier from wq_has_sleeper call in
+ * io_commit_cqring
+ */
smp_rmb();
- if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
+ if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head !=
+ ctx->sq_ring->ring_entries)
mask |= EPOLLOUT | EPOLLWRNORM;
if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
mask |= EPOLLIN | EPOLLRDNORM;
@@ -2658,24 +2771,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
mutex_lock(&ctx->uring_lock);
submitted = io_ring_submit(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
-
- if (submitted < 0)
- goto out_ctx;
}
if (flags & IORING_ENTER_GETEVENTS) {
unsigned nr_events = 0;
min_complete = min(min_complete, ctx->cq_entries);
- /*
- * The application could have included the 'to_submit' count
- * in how many events it wanted to wait for. If we failed to
- * submit the desired count, we may need to adjust the number
- * of events to poll/wait for.
- */
- if (submitted < to_submit)
- min_complete = min_t(unsigned, submitted, min_complete);
-
if (ctx->flags & IORING_SETUP_IOPOLL) {
mutex_lock(&ctx->uring_lock);
ret = io_iopoll_check(ctx, &nr_events, min_complete);
@@ -2721,17 +2822,12 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
return -EOVERFLOW;
ctx->sq_sqes = io_mem_alloc(size);
- if (!ctx->sq_sqes) {
- io_mem_free(ctx->sq_ring);
+ if (!ctx->sq_sqes)
return -ENOMEM;
- }
cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
- if (!cq_ring) {
- io_mem_free(ctx->sq_ring);
- io_mem_free(ctx->sq_sqes);
+ if (!cq_ring)
return -ENOMEM;
- }
ctx->cq_ring = cq_ring;
cq_ring->ring_mask = p->cq_entries - 1;
@@ -2902,11 +2998,31 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
+ __releases(ctx->uring_lock)
+ __acquires(ctx->uring_lock)
{
int ret;
+ /*
+ * We're inside the ring mutex, if the ref is already dying, then
+ * someone else killed the ctx or is already going through
+ * io_uring_register().
+ */
+ if (percpu_ref_is_dying(&ctx->refs))
+ return -ENXIO;
+
percpu_ref_kill(&ctx->refs);
+
+ /*
+ * Drop uring mutex before waiting for references to exit. If another
+ * thread is currently inside io_uring_enter() it might need to grab
+ * the uring_lock to make progress. If we hold it here across the drain
+ * wait, then we can deadlock. It's safe to drop the mutex here, since
+ * no new references will come in after we've killed the percpu ref.
+ */
+ mutex_unlock(&ctx->uring_lock);
wait_for_completion(&ctx->ctx_done);
+ mutex_lock(&ctx->uring_lock);
switch (opcode) {
case IORING_REGISTER_BUFFERS:
diff --git a/fs/iomap.c b/fs/iomap.c
index 97cb9d486a7d..abdd18e404f8 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1589,12 +1589,14 @@ static void iomap_dio_bio_end_io(struct bio *bio)
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
- struct bio_vec *bvec;
- int i;
- struct bvec_iter_all iter_all;
+ if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+ struct bvec_iter_all iter_all;
+ struct bio_vec *bvec;
+ int i;
- bio_for_each_segment_all(bvec, bio, i, iter_all)
- put_page(bvec->bv_page);
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
+ put_page(bvec->bv_page);
+ }
bio_put(bio);
}
}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 26f8d7e46462..02e0b79753e7 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -113,7 +113,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
nblocks = jbd2_space_needed(journal);
while (jbd2_log_space_left(journal) < nblocks) {
write_unlock(&journal->j_state_lock);
- mutex_lock(&journal->j_checkpoint_mutex);
+ mutex_lock_io(&journal->j_checkpoint_mutex);
/*
* Test again, another process may have checkpointed while we
@@ -276,9 +276,22 @@ restart:
"JBD2: %s: Waiting for Godot: block %llu\n",
journal->j_devname, (unsigned long long) bh->b_blocknr);
+ if (batch_count)
+ __flush_batch(journal, &batch_count);
jbd2_log_start_commit(journal, tid);
+ /*
+ * jbd2_journal_commit_transaction() may want
+ * to take the checkpoint_mutex if JBD2_FLUSHED
+ * is set, jbd2_update_log_tail() called by
+ * jbd2_journal_commit_transaction() may also take
+ * checkpoint_mutex. So we need to temporarily
+ * drop it.
+ */
+ mutex_unlock(&journal->j_checkpoint_mutex);
jbd2_log_wait_commit(journal, tid);
- goto retry;
+ mutex_lock_io(&journal->j_checkpoint_mutex);
+ spin_lock(&journal->j_list_lock);
+ goto restart;
}
if (!buffer_dirty(bh)) {
if (unlikely(buffer_write_io_error(bh)) && !result)
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 2eb55c3361a8..efd0ce9489ae 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -694,9 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
the last tag we set up. */
tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
-
- jbd2_descriptor_block_csum_set(journal, descriptor);
start_journal_io:
+ if (descriptor)
+ jbd2_descriptor_block_csum_set(journal,
+ descriptor);
+
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
/*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8ef6b6daaa7a..382c030cc78b 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -142,22 +142,6 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
return cpu_to_be32(csum);
}
-static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
-{
- if (!jbd2_journal_has_csum_v2or3(j))
- return 1;
-
- return sb->s_checksum == jbd2_superblock_csum(j, sb);
-}
-
-static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
-{
- if (!jbd2_journal_has_csum_v2or3(j))
- return;
-
- sb->s_checksum = jbd2_superblock_csum(j, sb);
-}
-
/*
* Helper function used to manage commit timeouts
*/
@@ -1356,6 +1340,10 @@ static int journal_reset(journal_t *journal)
return jbd2_journal_start_thread(journal);
}
+/*
+ * This function expects that the caller will have locked the journal
+ * buffer head, and will return with it unlocked
+ */
static int jbd2_write_superblock(journal_t *journal, int write_flags)
{
struct buffer_head *bh = journal->j_sb_buffer;
@@ -1365,7 +1353,6 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
trace_jbd2_write_superblock(journal, write_flags);
if (!(journal->j_flags & JBD2_BARRIER))
write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
- lock_buffer(bh);
if (buffer_write_io_error(bh)) {
/*
* Oh, dear. A previous attempt to write the journal
@@ -1381,7 +1368,8 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
clear_buffer_write_io_error(bh);
set_buffer_uptodate(bh);
}
- jbd2_superblock_csum_set(journal, sb);
+ if (jbd2_journal_has_csum_v2or3(journal))
+ sb->s_checksum = jbd2_superblock_csum(journal, sb);
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
@@ -1424,6 +1412,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
tail_block, tail_tid);
+ lock_buffer(journal->j_sb_buffer);
sb->s_sequence = cpu_to_be32(tail_tid);
sb->s_start = cpu_to_be32(tail_block);
@@ -1454,18 +1443,17 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
journal_superblock_t *sb = journal->j_superblock;
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
- read_lock(&journal->j_state_lock);
- /* Is it already empty? */
- if (sb->s_start == 0) {
- read_unlock(&journal->j_state_lock);
+ lock_buffer(journal->j_sb_buffer);
+ if (sb->s_start == 0) { /* Is it already empty? */
+ unlock_buffer(journal->j_sb_buffer);
return;
}
+
jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
journal->j_tail_sequence);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
sb->s_start = cpu_to_be32(0);
- read_unlock(&journal->j_state_lock);
jbd2_write_superblock(journal, write_op);
@@ -1488,9 +1476,8 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
journal_superblock_t *sb = journal->j_superblock;
int errcode;
- read_lock(&journal->j_state_lock);
+ lock_buffer(journal->j_sb_buffer);
errcode = journal->j_errno;
- read_unlock(&journal->j_state_lock);
if (errcode == -ESHUTDOWN)
errcode = 0;
jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
@@ -1595,17 +1582,18 @@ static int journal_get_superblock(journal_t *journal)
}
}
- /* Check superblock checksum */
- if (!jbd2_superblock_csum_verify(journal, sb)) {
- printk(KERN_ERR "JBD2: journal checksum error\n");
- err = -EFSBADCRC;
- goto out;
- }
+ if (jbd2_journal_has_csum_v2or3(journal)) {
+ /* Check superblock checksum */
+ if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
+ printk(KERN_ERR "JBD2: journal checksum error\n");
+ err = -EFSBADCRC;
+ goto out;
+ }
- /* Precompute checksum seed for all metadata */
- if (jbd2_journal_has_csum_v2or3(journal))
+ /* Precompute checksum seed for all metadata */
journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
sizeof(sb->s_uuid));
+ }
set_buffer_verified(bh);
@@ -1894,28 +1882,27 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
sb = journal->j_superblock;
+ /* Load the checksum driver if necessary */
+ if ((journal->j_chksum_driver == NULL) &&
+ INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+ journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(journal->j_chksum_driver)) {
+ printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
+ journal->j_chksum_driver = NULL;
+ return 0;
+ }
+ /* Precompute checksum seed for all metadata */
+ journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+ sizeof(sb->s_uuid));
+ }
+
+ lock_buffer(journal->j_sb_buffer);
+
/* If enabling v3 checksums, update superblock */
if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
sb->s_feature_compat &=
~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
-
- /* Load the checksum driver */
- if (journal->j_chksum_driver == NULL) {
- journal->j_chksum_driver = crypto_alloc_shash("crc32c",
- 0, 0);
- if (IS_ERR(journal->j_chksum_driver)) {
- printk(KERN_ERR "JBD2: Cannot load crc32c "
- "driver.\n");
- journal->j_chksum_driver = NULL;
- return 0;
- }
-
- /* Precompute checksum seed for all metadata */
- journal->j_csum_seed = jbd2_chksum(journal, ~0,
- sb->s_uuid,
- sizeof(sb->s_uuid));
- }
}
/* If enabling v1 checksums, downgrade superblock */
@@ -1927,6 +1914,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
sb->s_feature_compat |= cpu_to_be32(compat);
sb->s_feature_ro_compat |= cpu_to_be32(ro);
sb->s_feature_incompat |= cpu_to_be32(incompat);
+ unlock_buffer(journal->j_sb_buffer);
return 1;
#undef COMPAT_FEATURE_ON
@@ -2067,7 +2055,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
err = jbd2_journal_skip_recovery(journal);
if (write) {
/* Lock to make assertions happy... */
- mutex_lock(&journal->j_checkpoint_mutex);
+ mutex_lock_io(&journal->j_checkpoint_mutex);
jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index cc35537232f2..f940d31c2adc 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -63,7 +63,7 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
/*
* jbd2_get_transaction: obtain a new transaction_t object.
*
- * Simply allocate and initialise a new transaction. Create it in
+ * Simply initialise a new transaction. Initialize it in
* RUNNING state and add it to the current journal (which should not
* have an existing running transaction: we only make a new transaction
* once we have started to commit the old one).
@@ -75,8 +75,8 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
*
*/
-static transaction_t *
-jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
+static void jbd2_get_transaction(journal_t *journal,
+ transaction_t *transaction)
{
transaction->t_journal = journal;
transaction->t_state = T_RUNNING;
@@ -100,8 +100,6 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_max_wait = 0;
transaction->t_start = jiffies;
transaction->t_requested = 0;
-
- return transaction;
}
/*
@@ -1252,11 +1250,12 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
char *committed_data = NULL;
- JBUFFER_TRACE(jh, "entry");
if (jbd2_write_access_granted(handle, bh, true))
return 0;
jh = jbd2_journal_add_journal_head(bh);
+ JBUFFER_TRACE(jh, "entry");
+
/*
* Do this first --- it can drop the journal lock, so we want to
* make sure that obtaining the committed_data is done
@@ -1367,15 +1366,17 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
if (is_handle_aborted(handle))
return -EROFS;
- if (!buffer_jbd(bh)) {
- ret = -EUCLEAN;
- goto out;
- }
+ if (!buffer_jbd(bh))
+ return -EUCLEAN;
+
/*
* We don't grab jh reference here since the buffer must be part
* of the running transaction.
*/
jh = bh2jh(bh);
+ jbd_debug(5, "journal_head %p\n", jh);
+ JBUFFER_TRACE(jh, "entry");
+
/*
* This and the following assertions are unreliable since we may see jh
* in inconsistent state unless we grab bh_state lock. But this is
@@ -1409,9 +1410,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
}
journal = transaction->t_journal;
- jbd_debug(5, "journal_head %p\n", jh);
- JBUFFER_TRACE(jh, "entry");
-
jbd_lock_bh_state(bh);
if (jh->b_modified == 0) {
@@ -1597,9 +1595,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
__jbd2_journal_unfile_buffer(jh);
if (!buffer_jbd(bh)) {
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- __bforget(bh);
- goto drop;
+ goto not_jbd;
}
}
spin_unlock(&journal->j_list_lock);
@@ -1609,14 +1605,21 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
/* However, if the buffer is still owned by a prior
* (committing) transaction, we can't drop it yet... */
JBUFFER_TRACE(jh, "belongs to older transaction");
- /* ... but we CAN drop it from the new transaction if we
- * have also modified it since the original commit. */
+ /* ... but we CAN drop it from the new transaction through
+ * marking the buffer as freed and set j_next_transaction to
+ * the new transaction, so that not only the commit code
+ * knows it should clear dirty bits when it is done with the
+ * buffer, but also the buffer can be checkpointed only
+ * after the new transaction commits. */
- if (jh->b_next_transaction) {
- J_ASSERT(jh->b_next_transaction == transaction);
+ set_buffer_freed(bh);
+
+ if (!jh->b_next_transaction) {
spin_lock(&journal->j_list_lock);
- jh->b_next_transaction = NULL;
+ jh->b_next_transaction = transaction;
spin_unlock(&journal->j_list_lock);
+ } else {
+ J_ASSERT(jh->b_next_transaction == transaction);
/*
* only drop a reference if this transaction modified
@@ -1625,9 +1628,40 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
if (was_modified)
drop_reserve = 1;
}
+ } else {
+ /*
+ * Finally, if the buffer is not belongs to any
+ * transaction, we can just drop it now if it has no
+ * checkpoint.
+ */
+ spin_lock(&journal->j_list_lock);
+ if (!jh->b_cp_transaction) {
+ JBUFFER_TRACE(jh, "belongs to none transaction");
+ spin_unlock(&journal->j_list_lock);
+ goto not_jbd;
+ }
+
+ /*
+ * Otherwise, if the buffer has been written to disk,
+ * it is safe to remove the checkpoint and drop it.
+ */
+ if (!buffer_dirty(bh)) {
+ __jbd2_journal_remove_checkpoint(jh);
+ spin_unlock(&journal->j_list_lock);
+ goto not_jbd;
+ }
+
+ /*
+ * The buffer is still not written to disk, we should
+ * attach this buffer to current transaction so that the
+ * buffer can be checkpointed only after the current
+ * transaction commits.
+ */
+ clear_buffer_dirty(bh);
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
+ spin_unlock(&journal->j_list_lock);
}
-not_jbd:
jbd_unlock_bh_state(bh);
__brelse(bh);
drop:
@@ -1636,6 +1670,11 @@ drop:
handle->h_buffer_credits++;
}
return err;
+
+not_jbd:
+ jbd_unlock_bh_state(bh);
+ __bforget(bh);
+ goto drop;
}
/**
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 389ea53ea487..bccfc40b3a74 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1414,11 +1414,6 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f)
jffs2_kill_fragtree(&f->fragtree, deleted?c:NULL);
- if (f->target) {
- kfree(f->target);
- f->target = NULL;
- }
-
fds = f->dents;
while(fds) {
fd = fds;
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index bb6ae387469f..05d892c79339 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -47,7 +47,10 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
static void jffs2_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
+ struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+
+ kfree(f->target);
+ kmem_cache_free(jffs2_inode_cachep, f);
}
static void jffs2_destroy_inode(struct inode *inode)
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index dba810cd83b1..0b7d197a904c 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -17,6 +17,7 @@
#include <linux/xattr.h>
#include <linux/kernfs.h>
+#include <linux/fs_context.h>
struct kernfs_iattrs {
struct iattr ia_iattr;
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f3ac352699cf..9a4646eecb71 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -22,16 +22,6 @@
struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
-static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
-{
- struct kernfs_root *root = kernfs_info(sb)->root;
- struct kernfs_syscall_ops *scops = root->syscall_ops;
-
- if (scops && scops->remount_fs)
- return scops->remount_fs(root, flags, data);
- return 0;
-}
-
static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
{
struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry));
@@ -60,7 +50,6 @@ const struct super_operations kernfs_sops = {
.drop_inode = generic_delete_inode,
.evict_inode = kernfs_evict_inode,
- .remount_fs = kernfs_sop_remount_fs,
.show_options = kernfs_sop_show_options,
.show_path = kernfs_sop_show_path,
};
@@ -222,7 +211,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
} while (true);
}
-static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
+static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc)
{
struct kernfs_super_info *info = kernfs_info(sb);
struct inode *inode;
@@ -233,7 +222,7 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
- sb->s_magic = magic;
+ sb->s_magic = kfc->magic;
sb->s_op = &kernfs_sops;
sb->s_xattr = kernfs_xattr_handlers;
if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP)
@@ -263,21 +252,20 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
return 0;
}
-static int kernfs_test_super(struct super_block *sb, void *data)
+static int kernfs_test_super(struct super_block *sb, struct fs_context *fc)
{
struct kernfs_super_info *sb_info = kernfs_info(sb);
- struct kernfs_super_info *info = data;
+ struct kernfs_super_info *info = fc->s_fs_info;
return sb_info->root == info->root && sb_info->ns == info->ns;
}
-static int kernfs_set_super(struct super_block *sb, void *data)
+static int kernfs_set_super(struct super_block *sb, struct fs_context *fc)
{
- int error;
- error = set_anon_super(sb, data);
- if (!error)
- sb->s_fs_info = data;
- return error;
+ struct kernfs_fs_context *kfc = fc->fs_private;
+
+ kfc->ns_tag = NULL;
+ return set_anon_super_fc(sb, fc);
}
/**
@@ -294,63 +282,60 @@ const void *kernfs_super_ns(struct super_block *sb)
}
/**
- * kernfs_mount_ns - kernfs mount helper
- * @fs_type: file_system_type of the fs being mounted
- * @flags: mount flags specified for the mount
- * @root: kernfs_root of the hierarchy being mounted
- * @magic: file system specific magic number
- * @new_sb_created: tell the caller if we allocated a new superblock
- * @ns: optional namespace tag of the mount
+ * kernfs_get_tree - kernfs filesystem access/retrieval helper
+ * @fc: The filesystem context.
*
- * This is to be called from each kernfs user's file_system_type->mount()
- * implementation, which should pass through the specified @fs_type and
- * @flags, and specify the hierarchy and namespace tag to mount via @root
- * and @ns, respectively.
- *
- * The return value can be passed to the vfs layer verbatim.
+ * This is to be called from each kernfs user's fs_context->ops->get_tree()
+ * implementation, which should set the specified ->@fs_type and ->@flags, and
+ * specify the hierarchy and namespace tag to mount via ->@root and ->@ns,
+ * respectively.
*/
-struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
- struct kernfs_root *root, unsigned long magic,
- bool *new_sb_created, const void *ns)
+int kernfs_get_tree(struct fs_context *fc)
{
+ struct kernfs_fs_context *kfc = fc->fs_private;
struct super_block *sb;
struct kernfs_super_info *info;
int error;
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
- info->root = root;
- info->ns = ns;
+ info->root = kfc->root;
+ info->ns = kfc->ns_tag;
INIT_LIST_HEAD(&info->node);
- sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
- &init_user_ns, info);
- if (IS_ERR(sb) || sb->s_fs_info != info)
- kfree(info);
+ fc->s_fs_info = info;
+ sb = sget_fc(fc, kernfs_test_super, kernfs_set_super);
if (IS_ERR(sb))
- return ERR_CAST(sb);
-
- if (new_sb_created)
- *new_sb_created = !sb->s_root;
+ return PTR_ERR(sb);
if (!sb->s_root) {
struct kernfs_super_info *info = kernfs_info(sb);
- error = kernfs_fill_super(sb, magic);
+ kfc->new_sb_created = true;
+
+ error = kernfs_fill_super(sb, kfc);
if (error) {
deactivate_locked_super(sb);
- return ERR_PTR(error);
+ return error;
}
sb->s_flags |= SB_ACTIVE;
mutex_lock(&kernfs_mutex);
- list_add(&info->node, &root->supers);
+ list_add(&info->node, &info->root->supers);
mutex_unlock(&kernfs_mutex);
}
- return dget(sb->s_root);
+ fc->root = dget(sb->s_root);
+ return 0;
+}
+
+void kernfs_free_fs_context(struct fs_context *fc)
+{
+ /* Note that we don't deal with kfc->ns_tag here. */
+ kfree(fc->s_fs_info);
+ fc->s_fs_info = NULL;
}
/**
@@ -377,36 +362,6 @@ void kernfs_kill_sb(struct super_block *sb)
kfree(info);
}
-/**
- * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
- * @kernfs_root: the kernfs_root in question
- * @ns: the namespace tag
- *
- * Pin the superblock so the superblock won't be destroyed in subsequent
- * operations. This can be used to block ->kill_sb() which may be useful
- * for kernfs users which dynamically manage superblocks.
- *
- * Returns NULL if there's no superblock associated to this kernfs_root, or
- * -EINVAL if the superblock is being freed.
- */
-struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
-{
- struct kernfs_super_info *info;
- struct super_block *sb = NULL;
-
- mutex_lock(&kernfs_mutex);
- list_for_each_entry(info, &root->supers, node) {
- if (info->ns == ns) {
- sb = info->sb;
- if (!atomic_inc_not_zero(&info->sb->s_active))
- sb = ERR_PTR(-EINVAL);
- break;
- }
- }
- mutex_unlock(&kernfs_mutex);
- return sb;
-}
-
void __init kernfs_init(void)
{
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 214a2fa1f1e3..7df6324ccb8a 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -75,17 +75,6 @@ static void nlm4_compute_offsets(const struct nlm_lock *lock,
}
/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("lockd: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
-
-/*
* Encode/decode NLMv4 basic data types
*
* Basic NLMv4 data types are defined in Appendix II, section 6.1.4
@@ -176,7 +165,6 @@ out_size:
dprintk("NFS: returned cookie was too long: %u\n", length);
return -EIO;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -236,7 +224,6 @@ out_bad_xdr:
__func__, be32_to_cpup(p));
return -EIO;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -309,7 +296,6 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
out:
return error;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 747b9c8c940a..4df62f635529 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -71,17 +71,6 @@ static void nlm_compute_offsets(const struct nlm_lock *lock,
}
/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("lockd: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
-
-/*
* Encode/decode NLMv3 basic data types
*
* Basic NLMv3 data types are not defined in an IETF standards
@@ -173,7 +162,6 @@ out_size:
dprintk("NFS: returned cookie was too long: %u\n", length);
return -EIO;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -231,7 +219,6 @@ out_enum:
__func__, be32_to_cpup(p));
return -EIO;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -303,7 +290,6 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
out:
return error;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 93fb7cf0b92b..f0b5c987d6ae 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -290,12 +290,11 @@ void nlmclnt_release_host(struct nlm_host *host)
WARN_ON_ONCE(host->h_server);
- if (refcount_dec_and_test(&host->h_count)) {
+ if (refcount_dec_and_mutex_lock(&host->h_count, &nlm_host_mutex)) {
WARN_ON_ONCE(!list_empty(&host->h_lockowners));
WARN_ON_ONCE(!list_empty(&host->h_granted));
WARN_ON_ONCE(!list_empty(&host->h_reclaim));
- mutex_lock(&nlm_host_mutex);
nlm_destroy_host_locked(host);
mutex_unlock(&nlm_host_mutex);
}
diff --git a/fs/locks.c b/fs/locks.c
index eaa1cfaf73b0..71d0c6c2aac5 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1160,6 +1160,11 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
*/
error = -EDEADLK;
spin_lock(&blocked_lock_lock);
+ /*
+ * Ensure that we don't find any locks blocked on this
+ * request during deadlock detection.
+ */
+ __locks_wake_up_blocks(request);
if (likely(!posix_locks_deadlock(request, fl))) {
error = FILE_LOCK_DEFERRED;
__locks_insert_block(fl, request,
diff --git a/fs/mount.h b/fs/mount.h
index f39bc9da4d73..6250de544760 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -146,3 +146,8 @@ static inline bool is_local_mountpoint(struct dentry *dentry)
return __is_local_mountpoint(dentry);
}
+
+static inline bool is_anon_ns(struct mnt_namespace *ns)
+{
+ return ns->seq == 0;
+}
diff --git a/fs/namei.c b/fs/namei.c
index 0a8c5c27f90e..dede0147b3f6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2331,8 +2331,8 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
return err;
}
-static int filename_lookup(int dfd, struct filename *name, unsigned flags,
- struct path *path, struct path *root)
+int filename_lookup(int dfd, struct filename *name, unsigned flags,
+ struct path *path, struct path *root)
{
int retval;
struct nameidata nd;
@@ -3460,6 +3460,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
inode->i_state |= I_LINKABLE;
spin_unlock(&inode->i_lock);
}
+ ima_post_create_tmpfile(inode);
return child;
out_err:
diff --git a/fs/namespace.c b/fs/namespace.c
index 98a8c182af4f..c9cab307fa77 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,7 @@
#include <linux/task_work.h>
#include <linux/sched/task.h>
#include <uapi/linux/mount.h>
+#include <linux/fs_context.h>
#include "pnode.h"
#include "internal.h"
@@ -940,38 +941,81 @@ static struct mount *skip_mnt_tree(struct mount *p)
return p;
}
-struct vfsmount *
-vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
+/**
+ * vfs_create_mount - Create a mount for a configured superblock
+ * @fc: The configuration context with the superblock attached
+ *
+ * Create a mount to an already configured superblock. If necessary, the
+ * caller should invoke vfs_get_tree() before calling this.
+ *
+ * Note that this does not attach the mount to anything.
+ */
+struct vfsmount *vfs_create_mount(struct fs_context *fc)
{
struct mount *mnt;
- struct dentry *root;
- if (!type)
- return ERR_PTR(-ENODEV);
+ if (!fc->root)
+ return ERR_PTR(-EINVAL);
- mnt = alloc_vfsmnt(name);
+ mnt = alloc_vfsmnt(fc->source ?: "none");
if (!mnt)
return ERR_PTR(-ENOMEM);
- if (flags & SB_KERNMOUNT)
+ if (fc->sb_flags & SB_KERNMOUNT)
mnt->mnt.mnt_flags = MNT_INTERNAL;
- root = mount_fs(type, flags, name, data);
- if (IS_ERR(root)) {
- mnt_free_id(mnt);
- free_vfsmnt(mnt);
- return ERR_CAST(root);
- }
+ atomic_inc(&fc->root->d_sb->s_active);
+ mnt->mnt.mnt_sb = fc->root->d_sb;
+ mnt->mnt.mnt_root = dget(fc->root);
+ mnt->mnt_mountpoint = mnt->mnt.mnt_root;
+ mnt->mnt_parent = mnt;
- mnt->mnt.mnt_root = root;
- mnt->mnt.mnt_sb = root->d_sb;
- mnt->mnt_mountpoint = mnt->mnt.mnt_root;
- mnt->mnt_parent = mnt;
lock_mount_hash();
- list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
+ list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
unlock_mount_hash();
return &mnt->mnt;
}
+EXPORT_SYMBOL(vfs_create_mount);
+
+struct vfsmount *fc_mount(struct fs_context *fc)
+{
+ int err = vfs_get_tree(fc);
+ if (!err) {
+ up_write(&fc->root->d_sb->s_umount);
+ return vfs_create_mount(fc);
+ }
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(fc_mount);
+
+struct vfsmount *vfs_kern_mount(struct file_system_type *type,
+ int flags, const char *name,
+ void *data)
+{
+ struct fs_context *fc;
+ struct vfsmount *mnt;
+ int ret = 0;
+
+ if (!type)
+ return ERR_PTR(-EINVAL);
+
+ fc = fs_context_for_mount(type, flags);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ if (name)
+ ret = vfs_parse_fs_string(fc, "source",
+ name, strlen(name));
+ if (!ret)
+ ret = parse_monolithic_mount_data(fc, data);
+ if (!ret)
+ mnt = fc_mount(fc);
+ else
+ mnt = ERR_PTR(ret);
+
+ put_fs_context(fc);
+ return mnt;
+}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
struct vfsmount *
@@ -1013,27 +1057,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
mnt->mnt.mnt_flags = old->mnt.mnt_flags;
mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
- /* Don't allow unprivileged users to change mount flags */
- if (flag & CL_UNPRIVILEGED) {
- mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
-
- if (mnt->mnt.mnt_flags & MNT_READONLY)
- mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
-
- if (mnt->mnt.mnt_flags & MNT_NODEV)
- mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
-
- if (mnt->mnt.mnt_flags & MNT_NOSUID)
- mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
-
- if (mnt->mnt.mnt_flags & MNT_NOEXEC)
- mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
- }
-
- /* Don't allow unprivileged users to reveal what is under a mount */
- if ((flag & CL_UNPRIVILEGED) &&
- (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire)))
- mnt->mnt.mnt_flags |= MNT_LOCKED;
atomic_inc(&sb->s_active);
mnt->mnt.mnt_sb = sb;
@@ -1464,6 +1487,29 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
static void shrink_submounts(struct mount *mnt);
+static int do_umount_root(struct super_block *sb)
+{
+ int ret = 0;
+
+ down_write(&sb->s_umount);
+ if (!sb_rdonly(sb)) {
+ struct fs_context *fc;
+
+ fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
+ SB_RDONLY);
+ if (IS_ERR(fc)) {
+ ret = PTR_ERR(fc);
+ } else {
+ ret = parse_monolithic_mount_data(fc, NULL);
+ if (!ret)
+ ret = reconfigure_super(fc);
+ put_fs_context(fc);
+ }
+ }
+ up_write(&sb->s_umount);
+ return ret;
+}
+
static int do_umount(struct mount *mnt, int flags)
{
struct super_block *sb = mnt->mnt.mnt_sb;
@@ -1529,11 +1575,7 @@ static int do_umount(struct mount *mnt, int flags)
*/
if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
- down_write(&sb->s_umount);
- if (!sb_rdonly(sb))
- retval = do_remount_sb(sb, SB_RDONLY, NULL, 0);
- up_write(&sb->s_umount);
- return retval;
+ return do_umount_root(sb);
}
namespace_lock();
@@ -1839,6 +1881,33 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
return 0;
}
+static void lock_mnt_tree(struct mount *mnt)
+{
+ struct mount *p;
+
+ for (p = mnt; p; p = next_mnt(p, mnt)) {
+ int flags = p->mnt.mnt_flags;
+ /* Don't allow unprivileged users to change mount flags */
+ flags |= MNT_LOCK_ATIME;
+
+ if (flags & MNT_READONLY)
+ flags |= MNT_LOCK_READONLY;
+
+ if (flags & MNT_NODEV)
+ flags |= MNT_LOCK_NODEV;
+
+ if (flags & MNT_NOSUID)
+ flags |= MNT_LOCK_NOSUID;
+
+ if (flags & MNT_NOEXEC)
+ flags |= MNT_LOCK_NOEXEC;
+ /* Don't allow unprivileged users to reveal what is under a mount */
+ if (list_empty(&p->mnt_expire))
+ flags |= MNT_LOCKED;
+ p->mnt.mnt_flags = flags;
+ }
+}
+
static void cleanup_group_ids(struct mount *mnt, struct mount *end)
{
struct mount *p;
@@ -1956,6 +2025,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
struct mountpoint *dest_mp,
struct path *parent_path)
{
+ struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
HLIST_HEAD(tree_list);
struct mnt_namespace *ns = dest_mnt->mnt_ns;
struct mountpoint *smp;
@@ -2006,6 +2076,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
child->mnt_mountpoint);
if (q)
mnt_change_mountpoint(child, smp, q);
+ /* Notice when we are propagating across user namespaces */
+ if (child->mnt_parent->mnt_ns->user_ns != user_ns)
+ lock_mnt_tree(child);
commit_tree(child);
}
put_mountpoint(smp);
@@ -2313,7 +2386,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
int err;
struct super_block *sb = path->mnt->mnt_sb;
struct mount *mnt = real_mount(path->mnt);
- void *sec_opts = NULL;
+ struct fs_context *fc;
if (!check_mnt(mnt))
return -EINVAL;
@@ -2324,24 +2397,22 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
if (!can_change_locked_flags(mnt, mnt_flags))
return -EPERM;
- if (data && !(sb->s_type->fs_flags & FS_BINARY_MOUNTDATA)) {
- err = security_sb_eat_lsm_opts(data, &sec_opts);
- if (err)
- return err;
- }
- err = security_sb_remount(sb, sec_opts);
- security_free_mnt_opts(&sec_opts);
- if (err)
- return err;
+ fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
+ if (IS_ERR(fc))
+ return PTR_ERR(fc);
- down_write(&sb->s_umount);
- err = -EPERM;
- if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
- err = do_remount_sb(sb, sb_flags, data, 0);
- if (!err)
- set_mount_attributes(mnt, mnt_flags);
+ err = parse_monolithic_mount_data(fc, data);
+ if (!err) {
+ down_write(&sb->s_umount);
+ err = -EPERM;
+ if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
+ err = reconfigure_super(fc);
+ if (!err)
+ set_mount_attributes(mnt, mnt_flags);
+ }
+ up_write(&sb->s_umount);
}
- up_write(&sb->s_umount);
+ put_fs_context(fc);
return err;
}
@@ -2425,29 +2496,6 @@ out:
return err;
}
-static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
-{
- int err;
- const char *subtype = strchr(fstype, '.');
- if (subtype) {
- subtype++;
- err = -EINVAL;
- if (!subtype[0])
- goto err;
- } else
- subtype = "";
-
- mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
- err = -ENOMEM;
- if (!mnt->mnt_sb->s_subtype)
- goto err;
- return mnt;
-
- err:
- mntput(mnt);
- return ERR_PTR(err);
-}
-
/*
* add a mount into a namespace's mount tree
*/
@@ -2492,7 +2540,39 @@ unlock:
return err;
}
-static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags);
+static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
+
+/*
+ * Create a new mount using a superblock configuration and request it
+ * be added to the namespace tree.
+ */
+static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
+ unsigned int mnt_flags)
+{
+ struct vfsmount *mnt;
+ struct super_block *sb = fc->root->d_sb;
+ int error;
+
+ error = security_sb_kern_mount(sb);
+ if (!error && mount_too_revealing(sb, &mnt_flags))
+ error = -EPERM;
+
+ if (unlikely(error)) {
+ fc_drop_locked(fc);
+ return error;
+ }
+
+ up_write(&sb->s_umount);
+
+ mnt = vfs_create_mount(fc);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
+ if (error < 0)
+ mntput(mnt);
+ return error;
+}
/*
* create a new mount for userspace and request it to be added into the
@@ -2502,8 +2582,9 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
int mnt_flags, const char *name, void *data)
{
struct file_system_type *type;
- struct vfsmount *mnt;
- int err;
+ struct fs_context *fc;
+ const char *subtype = NULL;
+ int err = 0;
if (!fstype)
return -EINVAL;
@@ -2512,23 +2593,37 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
if (!type)
return -ENODEV;
- mnt = vfs_kern_mount(type, sb_flags, name, data);
- if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
- !mnt->mnt_sb->s_subtype)
- mnt = fs_set_subtype(mnt, fstype);
+ if (type->fs_flags & FS_HAS_SUBTYPE) {
+ subtype = strchr(fstype, '.');
+ if (subtype) {
+ subtype++;
+ if (!*subtype) {
+ put_filesystem(type);
+ return -EINVAL;
+ }
+ } else {
+ subtype = "";
+ }
+ }
+ fc = fs_context_for_mount(type, sb_flags);
put_filesystem(type);
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
-
- if (mount_too_revealing(mnt, &mnt_flags)) {
- mntput(mnt);
- return -EPERM;
- }
+ if (IS_ERR(fc))
+ return PTR_ERR(fc);
+
+ if (subtype)
+ err = vfs_parse_fs_string(fc, "subtype",
+ subtype, strlen(subtype));
+ if (!err && name)
+ err = vfs_parse_fs_string(fc, "source", name, strlen(name));
+ if (!err)
+ err = parse_monolithic_mount_data(fc, data);
+ if (!err)
+ err = vfs_get_tree(fc);
+ if (!err)
+ err = do_new_mount_fc(fc, path, mnt_flags);
- err = do_add_mount(real_mount(mnt), path, mnt_flags);
- if (err)
- mntput(mnt);
+ put_fs_context(fc);
return err;
}
@@ -2863,7 +2958,8 @@ static void dec_mnt_namespaces(struct ucounts *ucounts)
static void free_mnt_ns(struct mnt_namespace *ns)
{
- ns_free_inum(&ns->ns);
+ if (!is_anon_ns(ns))
+ ns_free_inum(&ns->ns);
dec_mnt_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
kfree(ns);
@@ -2878,7 +2974,7 @@ static void free_mnt_ns(struct mnt_namespace *ns)
*/
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
-static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
struct mnt_namespace *new_ns;
struct ucounts *ucounts;
@@ -2888,28 +2984,27 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
if (!ucounts)
return ERR_PTR(-ENOSPC);
- new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
+ new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
if (!new_ns) {
dec_mnt_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
}
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
- dec_mnt_namespaces(ucounts);
- return ERR_PTR(ret);
+ if (!anon) {
+ ret = ns_alloc_inum(&new_ns->ns);
+ if (ret) {
+ kfree(new_ns);
+ dec_mnt_namespaces(ucounts);
+ return ERR_PTR(ret);
+ }
}
new_ns->ns.ops = &mntns_operations;
- new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
+ if (!anon)
+ new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
atomic_set(&new_ns->count, 1);
- new_ns->root = NULL;
INIT_LIST_HEAD(&new_ns->list);
init_waitqueue_head(&new_ns->poll);
- new_ns->event = 0;
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
- new_ns->mounts = 0;
- new_ns->pending_mounts = 0;
return new_ns;
}
@@ -2933,7 +3028,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
old = ns->root;
- new_ns = alloc_mnt_ns(user_ns);
+ new_ns = alloc_mnt_ns(user_ns, false);
if (IS_ERR(new_ns))
return new_ns;
@@ -2941,13 +3036,18 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
/* First pass: copy the tree topology */
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
if (user_ns != ns->user_ns)
- copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
+ copy_flags |= CL_SHARED_TO_SLAVE;
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
namespace_unlock();
free_mnt_ns(new_ns);
return ERR_CAST(new);
}
+ if (user_ns != ns->user_ns) {
+ lock_mount_hash();
+ lock_mnt_tree(new);
+ unlock_mount_hash();
+ }
new_ns->root = new;
list_add_tail(&new_ns->list, &new->mnt_list);
@@ -2988,37 +3088,25 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
return new_ns;
}
-/**
- * create_mnt_ns - creates a private namespace and adds a root filesystem
- * @mnt: pointer to the new root filesystem mountpoint
- */
-static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
-{
- struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
- if (!IS_ERR(new_ns)) {
- struct mount *mnt = real_mount(m);
- mnt->mnt_ns = new_ns;
- new_ns->root = mnt;
- new_ns->mounts++;
- list_add(&mnt->mnt_list, &new_ns->list);
- } else {
- mntput(m);
- }
- return new_ns;
-}
-
-struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
+struct dentry *mount_subtree(struct vfsmount *m, const char *name)
{
+ struct mount *mnt = real_mount(m);
struct mnt_namespace *ns;
struct super_block *s;
struct path path;
int err;
- ns = create_mnt_ns(mnt);
- if (IS_ERR(ns))
+ ns = alloc_mnt_ns(&init_user_ns, true);
+ if (IS_ERR(ns)) {
+ mntput(m);
return ERR_CAST(ns);
+ }
+ mnt->mnt_ns = ns;
+ ns->root = mnt;
+ ns->mounts++;
+ list_add(&mnt->mnt_list, &ns->list);
- err = vfs_path_lookup(mnt->mnt_root, mnt,
+ err = vfs_path_lookup(m->mnt_root, m,
name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
put_mnt_ns(ns);
@@ -3228,6 +3316,7 @@ out0:
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
+ struct mount *m;
struct mnt_namespace *ns;
struct path root;
struct file_system_type *type;
@@ -3240,10 +3329,14 @@ static void __init init_mount_tree(void)
if (IS_ERR(mnt))
panic("Can't create rootfs");
- ns = create_mnt_ns(mnt);
+ ns = alloc_mnt_ns(&init_user_ns, false);
if (IS_ERR(ns))
panic("Can't allocate initial namespace");
-
+ m = real_mount(mnt);
+ m->mnt_ns = ns;
+ ns->root = m;
+ ns->mounts = 1;
+ list_add(&m->mnt_list, &ns->list);
init_task.nsproxy->mnt_ns = ns;
get_mnt_ns(ns);
@@ -3297,10 +3390,10 @@ void put_mnt_ns(struct mnt_namespace *ns)
free_mnt_ns(ns);
}
-struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
+struct vfsmount *kern_mount(struct file_system_type *type)
{
struct vfsmount *mnt;
- mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data);
+ mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
if (!IS_ERR(mnt)) {
/*
* it is a longterm mount, don't release mnt until
@@ -3310,7 +3403,7 @@ struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
}
return mnt;
}
-EXPORT_SYMBOL_GPL(kern_mount_data);
+EXPORT_SYMBOL_GPL(kern_mount);
void kern_unmount(struct vfsmount *mnt)
{
@@ -3352,7 +3445,8 @@ bool current_chrooted(void)
return chrooted;
}
-static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
+static bool mnt_already_visible(struct mnt_namespace *ns,
+ const struct super_block *sb,
int *new_mnt_flags)
{
int new_flags = *new_mnt_flags;
@@ -3364,7 +3458,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
struct mount *child;
int mnt_flags;
- if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
+ if (mnt->mnt.mnt_sb->s_type != sb->s_type)
continue;
/* This mount is not fully visible if it's root directory
@@ -3415,7 +3509,7 @@ found:
return visible;
}
-static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
+static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
{
const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
@@ -3425,7 +3519,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
return false;
/* Can this filesystem be too revealing? */
- s_iflags = mnt->mnt_sb->s_iflags;
+ s_iflags = sb->s_iflags;
if (!(s_iflags & SB_I_USERNS_VISIBLE))
return false;
@@ -3435,7 +3529,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
return true;
}
- return !mnt_already_visible(ns, mnt, new_mnt_flags);
+ return !mnt_already_visible(ns, sb, new_mnt_flags);
}
bool mnt_may_suid(struct vfsmount *mnt)
@@ -3484,6 +3578,9 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return -EPERM;
+ if (is_anon_ns(mnt_ns))
+ return -EINVAL;
+
if (fs->users != 1)
return -EINVAL;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a87a56273407..06233bfa6d73 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -72,16 +72,6 @@ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p)
return xdr_ressize_check(rqstp, p);
}
-static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes)
-{
- __be32 *p;
-
- p = xdr_inline_decode(xdr, nbytes);
- if (unlikely(p == NULL))
- printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n");
- return p;
-}
-
static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len,
const char **str, size_t maxlen)
{
@@ -98,13 +88,13 @@ static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
{
__be32 *p;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
fh->size = ntohl(*p);
if (fh->size > NFS4_FHSIZE)
return htonl(NFS4ERR_BADHANDLE);
- p = read_buf(xdr, fh->size);
+ p = xdr_inline_decode(xdr, fh->size);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
memcpy(&fh->data[0], p, fh->size);
@@ -117,11 +107,11 @@ static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
__be32 *p;
unsigned int attrlen;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
attrlen = ntohl(*p);
- p = read_buf(xdr, attrlen << 2);
+ p = xdr_inline_decode(xdr, attrlen << 2);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
if (likely(attrlen > 0))
@@ -135,7 +125,7 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
{
__be32 *p;
- p = read_buf(xdr, NFS4_STATEID_SIZE);
+ p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
memcpy(stateid->data, p, NFS4_STATEID_SIZE);
@@ -156,7 +146,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
status = decode_string(xdr, &hdr->taglen, &hdr->tag, CB_OP_TAGLEN_MAXSZ);
if (unlikely(status != 0))
return status;
- p = read_buf(xdr, 12);
+ p = xdr_inline_decode(xdr, 12);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
hdr->minorversion = ntohl(*p++);
@@ -176,7 +166,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
{
__be32 *p;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE_HDR);
*op = ntohl(*p);
@@ -205,7 +195,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp,
status = decode_delegation_stateid(xdr, &args->stateid);
if (unlikely(status != 0))
return status;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
args->truncate = ntohl(*p);
@@ -227,7 +217,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
__be32 status = 0;
uint32_t iomode;
- p = read_buf(xdr, 4 * sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, 4 * sizeof(uint32_t));
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
@@ -245,14 +235,14 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
if (unlikely(status != 0))
return status;
- p = read_buf(xdr, 2 * sizeof(uint64_t));
+ p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t));
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
p = xdr_decode_hyper(p, &args->cbl_range.offset);
p = xdr_decode_hyper(p, &args->cbl_range.length);
return decode_layout_stateid(xdr, &args->cbl_stateid);
} else if (args->cbl_recall_type == RETURN_FSID) {
- p = read_buf(xdr, 2 * sizeof(uint64_t));
+ p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t));
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
p = xdr_decode_hyper(p, &args->cbl_fsid.major);
@@ -275,7 +265,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
args->ndevs = 0;
/* Num of device notifications */
- p = read_buf(xdr, sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, sizeof(uint32_t));
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
goto out;
@@ -298,7 +288,8 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
for (i = 0; i < n; i++) {
struct cb_devicenotifyitem *dev = &args->devs[i];
- p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
+ p = xdr_inline_decode(xdr, (4 * sizeof(uint32_t)) +
+ NFS4_DEVICEID4_SIZE);
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
goto err;
@@ -329,7 +320,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
- p = read_buf(xdr, sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, sizeof(uint32_t));
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
goto err;
@@ -359,7 +350,7 @@ static __be32 decode_sessionid(struct xdr_stream *xdr,
{
__be32 *p;
- p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN);
+ p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
@@ -379,13 +370,13 @@ static __be32 decode_rc_list(struct xdr_stream *xdr,
goto out;
status = htonl(NFS4ERR_RESOURCE);
- p = read_buf(xdr, sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, sizeof(uint32_t));
if (unlikely(p == NULL))
goto out;
rc_list->rcl_nrefcalls = ntohl(*p++);
if (rc_list->rcl_nrefcalls) {
- p = read_buf(xdr,
+ p = xdr_inline_decode(xdr,
rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
if (unlikely(p == NULL))
goto out;
@@ -418,7 +409,7 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
if (status)
return status;
- p = read_buf(xdr, 5 * sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, 5 * sizeof(uint32_t));
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
@@ -461,7 +452,7 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
uint32_t bitmap[2];
__be32 *p, status;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
args->craa_objs_to_keep = ntohl(*p++);
@@ -480,7 +471,7 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
struct cb_recallslotargs *args = argp;
__be32 *p;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
args->crsa_target_highest_slotid = ntohl(*p++);
@@ -492,14 +483,14 @@ static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_arg
__be32 *p;
unsigned int len;
- p = read_buf(xdr, 12);
+ p = xdr_inline_decode(xdr, 12);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
p = xdr_decode_hyper(p, &args->cbnl_owner.clientid);
len = be32_to_cpu(*p);
- p = read_buf(xdr, len);
+ p = xdr_inline_decode(xdr, len);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
@@ -537,7 +528,7 @@ static __be32 decode_write_response(struct xdr_stream *xdr,
__be32 *p;
/* skip the always zero field */
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out;
p++;
@@ -577,7 +568,7 @@ static __be32 decode_offload_args(struct svc_rqst *rqstp,
return status;
/* decode status */
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out;
args->error = ntohl(*p++);
@@ -943,10 +934,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
};
unsigned int nops = 0;
- xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
+ xdr_init_decode(&xdr_in, &rqstp->rq_arg,
+ rqstp->rq_arg.head[0].iov_base, NULL);
p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
- xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
+ xdr_init_encode(&xdr_out, &rqstp->rq_res, p, NULL);
status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
if (status == htonl(NFS4ERR_RESOURCE))
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index fb1cf1a4bda2..90d71fda65ce 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -453,7 +453,7 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
case XPRT_TRANSPORT_RDMA:
if (retrans == NFS_UNSPEC_RETRANS)
to->to_retries = NFS_DEF_TCP_RETRANS;
- if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0)
+ if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
to->to_initval = NFS_MAX_TCP_TIMEOUT;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 885363ca8569..2f6b447cdd82 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -229,6 +229,8 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
spin_lock(&delegation->lock);
if (delegation->inode != NULL)
inode = igrab(delegation->inode);
+ if (!inode)
+ set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags);
spin_unlock(&delegation->lock);
return inode;
}
@@ -681,7 +683,7 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
/**
* nfs_super_return_all_delegations - return delegations for one superblock
- * @sb: sb to process
+ * @server: pointer to nfs_server to process
*
*/
void nfs_server_return_all_delegations(struct nfs_server *server)
@@ -944,10 +946,11 @@ restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry_rcu(delegation, &server->delegations,
super_list) {
- if (test_bit(NFS_DELEGATION_RETURNING,
- &delegation->flags))
- continue;
- if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_NEED_RECLAIM,
&delegation->flags) == 0)
continue;
if (!nfs_sb_active(server->super))
@@ -1053,10 +1056,11 @@ restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry_rcu(delegation, &server->delegations,
super_list) {
- if (test_bit(NFS_DELEGATION_RETURNING,
- &delegation->flags))
- continue;
- if (test_bit(NFS_DELEGATION_TEST_EXPIRED,
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_TEST_EXPIRED,
&delegation->flags) == 0)
continue;
if (!nfs_sb_active(server->super))
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index dcbf3394ba0e..35b4b02c1ae0 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -34,6 +34,7 @@ enum {
NFS_DELEGATION_RETURNING,
NFS_DELEGATION_REVOKED,
NFS_DELEGATION_TEST_EXPIRED,
+ NFS_DELEGATION_INODE_FREEING,
};
int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 6bf4471850c8..a71d0b42d160 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -139,12 +139,19 @@ struct nfs_cache_array {
struct nfs_cache_array_entry array[0];
};
+struct readdirvec {
+ unsigned long nr;
+ unsigned long index;
+ struct page *pages[NFS_MAX_READDIR_RAPAGES];
+};
+
typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool);
typedef struct {
struct file *file;
struct page *page;
struct dir_context *ctx;
unsigned long page_index;
+ struct readdirvec pvec;
u64 *dir_cookie;
u64 last_cookie;
loff_t current_index;
@@ -524,6 +531,10 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
struct nfs_cache_array *array;
unsigned int count = 0;
int status;
+ int max_rapages = NFS_MAX_READDIR_RAPAGES;
+
+ desc->pvec.index = desc->page_index;
+ desc->pvec.nr = 0;
scratch = alloc_page(GFP_KERNEL);
if (scratch == NULL)
@@ -548,20 +559,40 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
if (desc->plus)
nfs_prime_dcache(file_dentry(desc->file), entry);
- status = nfs_readdir_add_to_array(entry, page);
+ status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]);
+ if (status == -ENOSPC) {
+ desc->pvec.nr++;
+ if (desc->pvec.nr == max_rapages)
+ break;
+ status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]);
+ }
if (status != 0)
break;
} while (!entry->eof);
+ /*
+ * page and desc->pvec.pages[0] are valid, don't need to check
+ * whether or not to be NULL.
+ */
+ copy_highpage(page, desc->pvec.pages[0]);
+
out_nopages:
if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
- array = kmap(page);
+ array = kmap_atomic(desc->pvec.pages[desc->pvec.nr]);
array->eof_index = array->size;
status = 0;
- kunmap(page);
+ kunmap_atomic(array);
}
put_page(scratch);
+
+ /*
+ * desc->pvec.nr > 0 means at least one page was completely filled,
+ * we should return -ENOSPC. Otherwise function
+ * nfs_readdir_xdr_to_array will enter infinite loop.
+ */
+ if (desc->pvec.nr > 0)
+ return -ENOSPC;
return status;
}
@@ -574,8 +605,8 @@ void nfs_readdir_free_pages(struct page **pages, unsigned int npages)
}
/*
- * nfs_readdir_large_page will allocate pages that must be freed with a call
- * to nfs_readdir_free_pagearray
+ * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call
+ * to nfs_readdir_free_pages()
*/
static
int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
@@ -595,6 +626,24 @@ out_freepages:
return -ENOMEM;
}
+/*
+ * nfs_readdir_rapages_init initialize rapages by nfs_cache_array structure.
+ */
+static
+void nfs_readdir_rapages_init(nfs_readdir_descriptor_t *desc)
+{
+ struct nfs_cache_array *array;
+ int max_rapages = NFS_MAX_READDIR_RAPAGES;
+ int index;
+
+ for (index = 0; index < max_rapages; index++) {
+ array = kmap_atomic(desc->pvec.pages[index]);
+ memset(array, 0, sizeof(struct nfs_cache_array));
+ array->eof_index = -1;
+ kunmap_atomic(array);
+ }
+}
+
static
int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
{
@@ -605,6 +654,12 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
int status = -ENOMEM;
unsigned int array_size = ARRAY_SIZE(pages);
+ /*
+ * This means we hit readdir rdpages miss, the preallocated rdpages
+ * are useless, the preallocate rdpages should be reinitialized.
+ */
+ nfs_readdir_rapages_init(desc);
+
entry.prev_cookie = 0;
entry.cookie = desc->last_cookie;
entry.eof = 0;
@@ -664,9 +719,24 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
struct inode *inode = file_inode(desc->file);
int ret;
- ret = nfs_readdir_xdr_to_array(desc, page, inode);
- if (ret < 0)
- goto error;
+ /*
+ * If desc->page_index in range desc->pvec.index and
+ * desc->pvec.index + desc->pvec.nr, we get readdir cache hit.
+ */
+ if (desc->page_index >= desc->pvec.index &&
+ desc->page_index < (desc->pvec.index + desc->pvec.nr)) {
+ /*
+ * page and desc->pvec.pages[x] are valid, don't need to check
+ * whether or not to be NULL.
+ */
+ copy_highpage(page, desc->pvec.pages[desc->page_index - desc->pvec.index]);
+ ret = 0;
+ } else {
+ ret = nfs_readdir_xdr_to_array(desc, page, inode);
+ if (ret < 0)
+ goto error;
+ }
+
SetPageUptodate(page);
if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
@@ -831,6 +901,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
*desc = &my_desc;
struct nfs_open_dir_context *dir_ctx = file->private_data;
int res = 0;
+ int max_rapages = NFS_MAX_READDIR_RAPAGES;
dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
file, (long long)ctx->pos);
@@ -850,6 +921,12 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->decode = NFS_PROTO(inode)->decode_dirent;
desc->plus = nfs_use_readdirplus(inode, ctx);
+ res = nfs_readdir_alloc_pages(desc->pvec.pages, max_rapages);
+ if (res < 0)
+ return -ENOMEM;
+
+ nfs_readdir_rapages_init(desc);
+
if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
@@ -885,6 +962,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
break;
} while (!desc->eof);
out:
+ nfs_readdir_free_pages(desc->pvec.pages, max_rapages);
if (res > 0)
res = 0;
dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);
@@ -945,7 +1023,7 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
/**
* nfs_force_lookup_revalidate - Mark the directory as having changed
- * @dir - pointer to directory inode
+ * @dir: pointer to directory inode
*
* This forces the revalidation code in nfs_lookup_revalidate() to do a
* full lookup on all child dentries of 'dir' whenever a change occurs
@@ -1649,7 +1727,7 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
reval_dentry:
if (flags & LOOKUP_RCU)
return -ECHILD;
- return nfs_lookup_revalidate_dentry(dir, dentry, inode);;
+ return nfs_lookup_revalidate_dentry(dir, dentry, inode);
full_reval:
return nfs_do_lookup_revalidate(dir, dentry, flags);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 33824a0a57bf..0fd811ac08b5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -428,7 +428,7 @@ out_put:
hdr->release(hdr);
}
-static void nfs_read_sync_pgio_error(struct list_head *head)
+static void nfs_read_sync_pgio_error(struct list_head *head, int error)
{
struct nfs_page *req;
@@ -664,8 +664,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
if (!nfs_pageio_add_request(&desc, req)) {
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &failed);
+ nfs_list_move_request(req, &failed);
spin_lock(&cinfo.inode->i_lock);
dreq->flags = 0;
if (desc.pg_error < 0)
@@ -821,7 +820,7 @@ out_put:
hdr->release(hdr);
}
-static void nfs_write_sync_pgio_error(struct list_head *head)
+static void nfs_write_sync_pgio_error(struct list_head *head, int error)
{
struct nfs_page *req;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 29553fdba8af..4899b85f9b3c 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -89,8 +89,8 @@ EXPORT_SYMBOL_GPL(nfs_file_release);
/**
* nfs_revalidate_size - Revalidate the file size
- * @inode - pointer to inode struct
- * @file - pointer to struct file
+ * @inode: pointer to inode struct
+ * @filp: pointer to struct file
*
* Revalidates the file length. This is basically a wrapper around
* nfs_revalidate_inode() that takes into account the fact that we may
@@ -276,6 +276,12 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync);
* then a modify/write/read cycle when writing to a page in the
* page cache.
*
+ * Some pNFS layout drivers can only read/write at a certain block
+ * granularity like all block devices and therefore we must perform
+ * read/modify/write whenever a page hasn't read yet and the data
+ * to be written there is not aligned to a block boundary and/or
+ * smaller than the block size.
+ *
* The modify/write/read cycle may occur if a page is read before
* being completely filled by the writer. In this situation, the
* page must be completely written to stable storage on the server
@@ -291,26 +297,32 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync);
* and that the new data won't completely replace the old data in
* that range of the file.
*/
-static int nfs_want_read_modify_write(struct file *file, struct page *page,
- loff_t pos, unsigned len)
+static bool nfs_full_page_write(struct page *page, loff_t pos, unsigned int len)
{
unsigned int pglen = nfs_page_length(page);
unsigned int offset = pos & (PAGE_SIZE - 1);
unsigned int end = offset + len;
- if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
- if (!PageUptodate(page))
- return 1;
- return 0;
- }
+ return !pglen || (end >= pglen && !offset);
+}
- if ((file->f_mode & FMODE_READ) && /* open for read? */
- !PageUptodate(page) && /* Uptodate? */
- !PagePrivate(page) && /* i/o request already? */
- pglen && /* valid bytes of file? */
- (end < pglen || offset)) /* replace all valid bytes? */
- return 1;
- return 0;
+static bool nfs_want_read_modify_write(struct file *file, struct page *page,
+ loff_t pos, unsigned int len)
+{
+ /*
+ * Up-to-date pages, those with ongoing or full-page write
+ * don't need read/modify/write
+ */
+ if (PageUptodate(page) || PagePrivate(page) ||
+ nfs_full_page_write(page, pos, len))
+ return false;
+
+ if (pnfs_ld_read_whole_page(file->f_mapping->host))
+ return true;
+ /* Open for reading too? */
+ if (file->f_mode & FMODE_READ)
+ return true;
+ return false;
}
/*
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 63abe705f4ca..6673d4ff5a2a 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -410,7 +410,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
for (i = 0; i < fls->mirror_array_cnt; i++) {
struct nfs4_ff_layout_mirror *mirror;
struct cred *kcred;
- const struct cred *cred;
+ const struct cred __rcu *cred;
kuid_t uid;
kgid_t gid;
u32 ds_count, fh_count, id;
@@ -501,7 +501,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
goto out_err_free;
kcred->fsuid = uid;
kcred->fsgid = gid;
- cred = kcred;
+ cred = RCU_INITIALIZER(kcred);
if (lgr->range.iomode == IOMODE_READ)
rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
@@ -788,30 +788,82 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
}
}
+static void
+ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
+{
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+ if (devid)
+ nfs4_mark_deviceid_unavailable(devid);
+}
+
+static void
+ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx)
+{
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+ if (devid)
+ nfs4_mark_deviceid_available(devid);
+}
+
static struct nfs4_pnfs_ds *
-ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx,
- int *best_idx)
+ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx, int *best_idx,
+ bool check_device)
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+ struct nfs4_ff_layout_mirror *mirror;
struct nfs4_pnfs_ds *ds;
bool fail_return = false;
int idx;
- /* mirrors are sorted by efficiency */
+ /* mirrors are initially sorted by efficiency */
for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
if (idx+1 == fls->mirror_array_cnt)
- fail_return = true;
- ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return);
- if (ds) {
- *best_idx = idx;
- return ds;
- }
+ fail_return = !check_device;
+
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, fail_return);
+ if (!ds)
+ continue;
+
+ if (check_device &&
+ nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
+ continue;
+
+ *best_idx = idx;
+ return ds;
}
return NULL;
}
+static struct nfs4_pnfs_ds *
+ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx, int *best_idx)
+{
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx, int *best_idx)
+{
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx, int *best_idx)
+{
+ struct nfs4_pnfs_ds *ds;
+
+ ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
+ if (ds)
+ return ds;
+ return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
+}
+
static void
ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req,
@@ -925,7 +977,8 @@ retry:
goto out_mds;
for (i = 0; i < pgio->pg_mirror_count; i++) {
- ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
+ mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
+ ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true);
if (!ds) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
@@ -936,7 +989,6 @@ retry:
goto retry;
}
pgm = &pgio->pg_mirrors[i];
- mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
}
@@ -1071,6 +1123,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
break;
case -NFS4ERR_RETRY_UNCACHED_REP:
break;
+ case -EAGAIN:
+ return -NFS4ERR_RESET_TO_PNFS;
/* Invalidate Layout errors */
case -NFS4ERR_PNFS_NO_LAYOUT:
case -ESTALE: /* mapped NFS4ERR_STALE */
@@ -1131,6 +1185,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
case -EBADHANDLE:
case -ELOOP:
case -ENOSPC:
+ case -EAGAIN:
break;
case -EJUKEBOX:
nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
@@ -1158,8 +1213,10 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
{
int vers = clp->cl_nfs_mod->rpc_vers->number;
- if (task->tk_status >= 0)
+ if (task->tk_status >= 0) {
+ ff_layout_mark_ds_reachable(lseg, idx);
return 0;
+ }
/* Handle the case of an invalid layout segment */
if (!pnfs_is_valid_lseg(lseg))
@@ -1222,6 +1279,8 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, offset, length, status, opnum,
GFP_NOIO);
+ if (status == NFS4ERR_NXIO)
+ ff_layout_mark_ds_unreachable(lseg, idx);
pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
}
@@ -1230,6 +1289,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
static int ff_layout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ int new_idx = hdr->pgio_mirror_idx;
int err;
trace_nfs4_pnfs_read(hdr, task->tk_status);
@@ -1248,8 +1308,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
case -NFS4ERR_RESET_TO_PNFS:
if (ff_layout_choose_best_ds_for_read(hdr->lseg,
hdr->pgio_mirror_idx + 1,
- &hdr->pgio_mirror_idx))
- goto out_eagain;
+ &new_idx))
+ goto out_layouterror;
set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
@@ -1260,6 +1320,10 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
}
return 0;
+out_layouterror:
+ ff_layout_read_record_layoutstats_done(task, hdr);
+ ff_layout_send_layouterror(hdr->lseg);
+ hdr->pgio_mirror_idx = new_idx;
out_eagain:
rpc_restart_call_prepare(task);
return -EAGAIN;
@@ -1293,15 +1357,6 @@ ff_layout_set_layoutcommit(struct inode *inode,
(unsigned long long) NFS_I(inode)->layout->plh_lwb);
}
-static bool
-ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx)
-{
- /* No mirroring for now */
- struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
-
- return ff_layout_test_devid_unavailable(node);
-}
-
static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
@@ -1332,10 +1387,6 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
rpc_exit(task, -EIO);
return -EIO;
}
- if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
- rpc_exit(task, -EHOSTDOWN);
- return -EAGAIN;
- }
ff_layout_read_record_layoutstats_start(task, hdr);
return 0;
@@ -1369,6 +1420,16 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
ff_layout_read_prepare_common(task, hdr);
}
+static void
+ff_layout_io_prepare_transmit(struct rpc_task *task,
+ void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (!pnfs_is_valid_lseg(hdr->lseg))
+ rpc_exit(task, -EAGAIN);
+}
+
static void ff_layout_read_call_done(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
@@ -1399,9 +1460,10 @@ static void ff_layout_read_release(void *data)
struct nfs_pgio_header *hdr = data;
ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
- if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags))
+ if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
+ ff_layout_send_layouterror(hdr->lseg);
pnfs_read_resend_pnfs(hdr);
- else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
+ } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
ff_layout_reset_read(hdr);
pnfs_generic_rw_release(data);
}
@@ -1513,11 +1575,6 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
return -EIO;
}
- if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
- rpc_exit(task, -EHOSTDOWN);
- return -EAGAIN;
- }
-
ff_layout_write_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1573,9 +1630,10 @@ static void ff_layout_write_release(void *data)
struct nfs_pgio_header *hdr = data;
ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
- if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags))
+ if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
+ ff_layout_send_layouterror(hdr->lseg);
ff_layout_reset_write(hdr, true);
- else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
+ } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
ff_layout_reset_write(hdr, false);
pnfs_generic_rw_release(data);
}
@@ -1657,6 +1715,7 @@ static void ff_layout_commit_release(void *data)
static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
.rpc_call_prepare = ff_layout_read_prepare_v3,
+ .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
.rpc_release = ff_layout_read_release,
@@ -1664,6 +1723,7 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
.rpc_call_prepare = ff_layout_read_prepare_v4,
+ .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
.rpc_release = ff_layout_read_release,
@@ -1671,6 +1731,7 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
.rpc_call_prepare = ff_layout_write_prepare_v3,
+ .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
.rpc_release = ff_layout_write_release,
@@ -1678,6 +1739,7 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
.rpc_call_prepare = ff_layout_write_prepare_v4,
+ .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
.rpc_release = ff_layout_write_release,
@@ -1703,6 +1765,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
loff_t offset = hdr->args.offset;
u32 idx = hdr->pgio_mirror_idx;
@@ -1713,20 +1776,21 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
__func__, hdr->inode->i_ino,
hdr->args.pgbase, (size_t)hdr->args.count, offset);
- ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
if (!ds)
goto out_failed;
- ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
hdr->inode);
if (IS_ERR(ds_clnt))
goto out_failed;
- ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
if (!ds_cred)
goto out_failed;
- vers = nfs4_ff_layout_ds_version(lseg, idx);
+ vers = nfs4_ff_layout_ds_version(mirror);
dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);
@@ -1734,13 +1798,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->pgio_done_cb = ff_layout_read_done_cb;
refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
- fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
+ fh = nfs4_ff_layout_select_ds_fh(mirror);
if (fh)
hdr->args.fh = fh;
- if (vers == 4 &&
- !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid))
- goto out_failed;
+ nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
/*
* Note that if we ever decide to split across DSes,
@@ -1770,26 +1832,28 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
loff_t offset = hdr->args.offset;
int vers;
struct nfs_fh *fh;
int idx = hdr->pgio_mirror_idx;
- ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
if (!ds)
goto out_failed;
- ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
hdr->inode);
if (IS_ERR(ds_clnt))
goto out_failed;
- ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
if (!ds_cred)
goto out_failed;
- vers = nfs4_ff_layout_ds_version(lseg, idx);
+ vers = nfs4_ff_layout_ds_version(mirror);
dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
__func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
@@ -1800,13 +1864,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
hdr->ds_commit_idx = idx;
- fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
+ fh = nfs4_ff_layout_select_ds_fh(mirror);
if (fh)
hdr->args.fh = fh;
- if (vers == 4 &&
- !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid))
- goto out_failed;
+ nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
/*
* Note that if we ever decide to split across DSes,
@@ -1849,6 +1911,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
struct pnfs_layout_segment *lseg = data->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
u32 idx;
int vers, ret;
@@ -1859,20 +1922,21 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
goto out_err;
idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
- ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
if (!ds)
goto out_err;
- ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
data->inode);
if (IS_ERR(ds_clnt))
goto out_err;
- ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred);
if (!ds_cred)
goto out_err;
- vers = nfs4_ff_layout_ds_version(lseg, idx);
+ vers = nfs4_ff_layout_ds_version(mirror);
dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count),
@@ -2036,7 +2100,7 @@ ff_layout_encode_layoutreturn(struct xdr_stream *xdr,
dprintk("%s: Begin\n", __func__);
- xdr_init_encode(&tmp_xdr, &tmp_buf, NULL);
+ xdr_init_encode(&tmp_xdr, &tmp_buf, NULL, NULL);
ff_layout_encode_ioerr(&tmp_xdr, args, ff_args);
ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args);
@@ -2102,6 +2166,52 @@ out_nomem:
return -ENOMEM;
}
+#ifdef CONFIG_NFS_V4_2
+void
+ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_layout_hdr *lo = lseg->pls_layout;
+ struct nfs42_layout_error *errors;
+ LIST_HEAD(head);
+
+ if (!nfs_server_capable(lo->plh_inode, NFS_CAP_LAYOUTERROR))
+ return;
+ ff_layout_fetch_ds_ioerr(lo, &lseg->pls_range, &head, -1);
+ if (list_empty(&head))
+ return;
+
+ errors = kmalloc_array(NFS42_LAYOUTERROR_MAX,
+ sizeof(*errors), GFP_NOFS);
+ if (errors != NULL) {
+ const struct nfs4_ff_layout_ds_err *pos;
+ size_t n = 0;
+
+ list_for_each_entry(pos, &head, list) {
+ errors[n].offset = pos->offset;
+ errors[n].length = pos->length;
+ nfs4_stateid_copy(&errors[n].stateid, &pos->stateid);
+ errors[n].errors[0].dev_id = pos->deviceid;
+ errors[n].errors[0].status = pos->status;
+ errors[n].errors[0].opnum = pos->opnum;
+ n++;
+ if (!list_is_last(&pos->list, &head) &&
+ n < NFS42_LAYOUTERROR_MAX)
+ continue;
+ if (nfs42_proc_layouterror(lseg, errors, n) < 0)
+ break;
+ n = 0;
+ }
+ kfree(errors);
+ }
+ ff_layout_free_ds_ioerr(&head);
+}
+#else
+void
+ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
+{
+}
+#endif
+
static int
ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
{
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index c2626bad466b..2f369966abf7 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -132,16 +132,6 @@ FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg)
generic_hdr);
}
-static inline struct nfs4_deviceid_node *
-FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
-{
- if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt ||
- FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL ||
- FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL)
- return NULL;
- return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node;
-}
-
static inline struct nfs4_ff_layout_ds *
FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
{
@@ -151,9 +141,25 @@ FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
static inline struct nfs4_ff_layout_mirror *
FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
{
- if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt)
- return NULL;
- return FF_LAYOUT_LSEG(lseg)->mirror_array[idx];
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+
+ if (idx < fls->mirror_array_cnt)
+ return fls->mirror_array[idx];
+ return NULL;
+}
+
+static inline struct nfs4_deviceid_node *
+FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
+{
+ struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx);
+
+ if (mirror != NULL) {
+ struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds;
+
+ if (!IS_ERR_OR_NULL(mirror_ds))
+ return &mirror_ds->id_node;
+ }
+ return NULL;
}
static inline u32
@@ -174,28 +180,10 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg)
return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO;
}
-static inline bool
-ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
-{
- /*
- * Flexfiles should never mark a DS unavailable, but if it does
- * print a (ratelimited) warning as this can affect performance.
- */
- if (nfs4_test_deviceid_unavailable(node)) {
- u32 *p = (u32 *)node->deviceid.data;
-
- pr_warn_ratelimited("NFS: flexfiles layout referencing an "
- "unavailable device [%x%x%x%x]\n",
- p[0], p[1], p[2], p[3]);
- return true;
- }
- return false;
-}
-
static inline int
-nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx)
+nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror)
{
- return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version;
+ return mirror->mirror_ds->ds_versions[0].version;
}
struct nfs4_ff_layout_ds *
@@ -207,6 +195,7 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
struct nfs4_ff_layout_mirror *mirror, u64 offset,
u64 length, int status, enum nfs_opnum4 opnum,
gfp_t gfp_flags);
+void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg);
int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head);
void ff_layout_free_ds_ioerr(struct list_head *head);
unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
@@ -214,23 +203,23 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
struct list_head *head,
unsigned int maxnum);
struct nfs_fh *
-nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);
-int
-nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg,
- u32 mirror_idx,
- nfs4_stateid *stateid);
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror);
+void
+nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
+ nfs4_stateid *stateid);
struct nfs4_pnfs_ds *
-nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
+ struct nfs4_ff_layout_mirror *mirror,
bool fail_return);
struct rpc_clnt *
-nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
- u32 ds_idx,
+nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
struct nfs_client *ds_clp,
struct inode *inode);
-const struct cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
- u32 ds_idx, const struct cred *mdscred);
+const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
+ const struct pnfs_layout_range *range,
+ const struct cred *mdscred);
bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 11766a74216d..a809989807d6 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -183,56 +183,6 @@ out_err:
return NULL;
}
-static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg,
- struct nfs4_deviceid_node *devid)
-{
- nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid);
- if (!ff_layout_has_available_ds(lseg))
- pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
- lseg);
-}
-
-static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
- struct nfs4_ff_layout_mirror *mirror,
- bool create)
-{
- if (mirror == NULL || IS_ERR(mirror->mirror_ds))
- goto outerr;
- if (mirror->mirror_ds == NULL) {
- if (create) {
- struct nfs4_deviceid_node *node;
- struct pnfs_layout_hdr *lh = lseg->pls_layout;
- struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
-
- node = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
- &mirror->devid, lh->plh_lc_cred,
- GFP_KERNEL);
- if (node)
- mirror_ds = FF_LAYOUT_MIRROR_DS(node);
-
- /* check for race with another call to this function */
- if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
- mirror_ds != ERR_PTR(-ENODEV))
- nfs4_put_deviceid_node(node);
- } else
- goto outerr;
- }
-
- if (IS_ERR(mirror->mirror_ds))
- goto outerr;
-
- if (mirror->mirror_ds->ds == NULL) {
- struct nfs4_deviceid_node *devid;
- devid = &mirror->mirror_ds->id_node;
- ff_layout_mark_devid_invalid(lseg, devid);
- return false;
- }
- return true;
-outerr:
- pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
- return false;
-}
-
static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
u64 offset, u64 length)
{
@@ -326,7 +276,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
spin_lock(&flo->generic_hdr.plh_inode->i_lock);
ff_layout_add_ds_error_locked(flo, dserr);
spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
-
return 0;
}
@@ -353,46 +302,54 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
}
struct nfs_fh *
-nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
- struct nfs_fh *fh = NULL;
-
- if (!ff_layout_mirror_valid(lseg, mirror, false)) {
- pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
- __func__, mirror_idx);
- goto out;
- }
-
/* FIXME: For now assume there is only 1 version available for the DS */
- fh = &mirror->fh_versions[0];
-out:
- return fh;
+ return &mirror->fh_versions[0];
}
-int
-nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg,
- u32 mirror_idx,
- nfs4_stateid *stateid)
+void
+nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
+ nfs4_stateid *stateid)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
+ if (nfs4_ff_layout_ds_version(mirror) == 4)
+ nfs4_stateid_copy(stateid, &mirror->stateid);
+}
- if (!ff_layout_mirror_valid(lseg, mirror, false)) {
- pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
- __func__, mirror_idx);
- goto out;
+static bool
+ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo,
+ struct nfs4_ff_layout_mirror *mirror)
+{
+ if (mirror == NULL)
+ goto outerr;
+ if (mirror->mirror_ds == NULL) {
+ struct nfs4_deviceid_node *node;
+ struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
+
+ node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode),
+ &mirror->devid, lo->plh_lc_cred,
+ GFP_KERNEL);
+ if (node)
+ mirror_ds = FF_LAYOUT_MIRROR_DS(node);
+
+ /* check for race with another call to this function */
+ if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
+ mirror_ds != ERR_PTR(-ENODEV))
+ nfs4_put_deviceid_node(node);
}
- nfs4_stateid_copy(stateid, &mirror->stateid);
- return 1;
-out:
- return 0;
+ if (IS_ERR(mirror->mirror_ds))
+ goto outerr;
+
+ return true;
+outerr:
+ return false;
}
/**
* nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
* @lseg: the layout segment we're operating on
- * @ds_idx: index of the DS to use
+ * @mirror: layout mirror describing the DS to use
* @fail_return: return layout on connect failure?
*
* Try to prepare a DS connection to accept an RPC call. This involves
@@ -407,26 +364,18 @@ out:
* Returns a pointer to a connected DS object on success or NULL on failure.
*/
struct nfs4_pnfs_ds *
-nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
+ struct nfs4_ff_layout_mirror *mirror,
bool fail_return)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
struct nfs4_pnfs_ds *ds = NULL;
- struct nfs4_deviceid_node *devid;
struct inode *ino = lseg->pls_layout->plh_inode;
struct nfs_server *s = NFS_SERVER(ino);
unsigned int max_payload;
int status;
- if (!ff_layout_mirror_valid(lseg, mirror, true)) {
- pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
- __func__, ds_idx);
- goto out;
- }
-
- devid = &mirror->mirror_ds->id_node;
- if (ff_layout_test_devid_unavailable(devid))
- goto out_fail;
+ if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror))
+ goto noconnect;
ds = mirror->mirror_ds->ds;
/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
@@ -437,8 +386,8 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
/* FIXME: For now we assume the server sent only one version of NFS
* to use for the DS.
*/
- status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
- dataserver_retrans,
+ status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node,
+ dataserver_timeo, dataserver_retrans,
mirror->mirror_ds->ds_versions[0].version,
mirror->mirror_ds->ds_versions[0].minor_version);
@@ -453,11 +402,12 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
mirror->mirror_ds->ds_versions[0].wsize = max_payload;
goto out;
}
-out_fail:
+noconnect:
ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, lseg->pls_range.offset,
lseg->pls_range.length, NFS4ERR_NXIO,
OP_ILLEGAL, GFP_NOIO);
+ ff_layout_send_layouterror(lseg);
if (fail_return || !ff_layout_has_available_ds(lseg))
pnfs_error_mark_layout_for_return(ino, lseg);
ds = NULL;
@@ -466,14 +416,14 @@ out:
}
const struct cred *
-ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
+ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
+ const struct pnfs_layout_range *range,
const struct cred *mdscred)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
const struct cred *cred;
if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) {
- cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode);
+ cred = ff_layout_get_mirror_cred(mirror, range->iomode);
if (!cred)
cred = get_cred(mdscred);
} else {
@@ -483,15 +433,18 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
}
/**
-* Find or create a DS rpc client with th MDS server rpc client auth flavor
-* in the nfs_client cl_ds_clients list.
-*/
+ * nfs4_ff_find_or_create_ds_client - Find or create a DS rpc client
+ * @mirror: pointer to the mirror
+ * @ds_clp: nfs_client for the DS
+ * @inode: pointer to inode
+ *
+ * Find or create a DS rpc client with th MDS server rpc client auth flavor
+ * in the nfs_client cl_ds_clients list.
+ */
struct rpc_clnt *
-nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
+nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
struct nfs_client *ds_clp, struct inode *inode)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
-
switch (mirror->mirror_ds->ds_versions[0].version) {
case 3:
/* For NFSv3 DS, flavor is set when creating DS connections */
@@ -608,7 +561,7 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
if (IS_ERR(mirror->mirror_ds))
continue;
devid = &mirror->mirror_ds->id_node;
- if (!ff_layout_test_devid_unavailable(devid))
+ if (!nfs4_test_deviceid_unavailable(devid))
return true;
}
}
@@ -629,7 +582,7 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
if (!mirror->mirror_ds)
continue;
devid = &mirror->mirror_ds->id_node;
- if (ff_layout_test_devid_unavailable(devid))
+ if (nfs4_test_deviceid_unavailable(devid))
return false;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 094775ea0781..414a90d48493 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -143,6 +143,7 @@ EXPORT_SYMBOL_GPL(nfs_sync_inode);
/**
* nfs_sync_mapping - helper to flush all mmapped dirty data to disk
+ * @mapping: pointer to struct address_space
*/
int nfs_sync_mapping(struct address_space *mapping)
{
@@ -1184,8 +1185,8 @@ int nfs_attribute_cache_expired(struct inode *inode)
/**
* nfs_revalidate_inode - Revalidate the inode attributes
- * @server - pointer to nfs_server struct
- * @inode - pointer to inode struct
+ * @server: pointer to nfs_server struct
+ * @inode: pointer to inode struct
*
* Updates inode attribute information by retrieving the data from the server.
*/
@@ -1255,8 +1256,8 @@ out:
/**
* nfs_revalidate_mapping - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
+ * @inode: pointer to host inode
+ * @mapping: pointer to mapping
*/
int nfs_revalidate_mapping(struct inode *inode,
struct address_space *mapping)
@@ -1371,8 +1372,8 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/**
* nfs_check_inode_attributes - verify consistency of the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* Verifies the attribute cache. If we have just changed the attributes,
* so that fattr carries weak cache consistency data, then it may
@@ -1572,8 +1573,8 @@ EXPORT_SYMBOL_GPL(_nfs_display_fhandle);
/**
* nfs_inode_attrs_need_update - check if the inode attributes need updating
- * @inode - pointer to inode
- * @fattr - attributes
+ * @inode: pointer to inode
+ * @fattr: attributes
*
* Attempt to divine whether or not an RPC call reply carrying stale
* attributes got scheduled after another call carrying updated ones.
@@ -1614,8 +1615,8 @@ static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr
/**
* nfs_refresh_inode - try to update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* Check that an RPC call that returned attributes has not overlapped with
* other recent updates of the inode metadata, then decide whether it is
@@ -1649,8 +1650,8 @@ static int nfs_post_op_update_inode_locked(struct inode *inode,
/**
* nfs_post_op_update_inode - try to update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* After an operation that has changed the inode metadata, mark the
* attribute cache as being invalid, then try to update it.
@@ -1679,8 +1680,8 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
/**
* nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* After an operation that has changed the inode metadata, mark the
* attribute cache as being invalid, then try to update it. Fake up
@@ -1731,8 +1732,8 @@ out_noforce:
/**
* nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* After an operation that has changed the inode metadata, mark the
* attribute cache as being invalid, then try to update it. Fake up
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b1e577302518..c7cf23ae6597 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -69,7 +69,8 @@ struct nfs_clone_mount {
* Maximum number of pages that readdir can use for creating
* a vmapped array of pages.
*/
-#define NFS_MAX_READDIR_PAGES 8
+#define NFS_MAX_READDIR_PAGES 64
+#define NFS_MAX_READDIR_RAPAGES 8
struct nfs_client_initdata {
unsigned long init_flags;
@@ -755,6 +756,7 @@ static inline bool nfs_error_is_fatal(int err)
{
switch (err) {
case -ERESTARTSYS:
+ case -EINTR:
case -EACCES:
case -EDQUOT:
case -EFBIG:
@@ -763,6 +765,7 @@ static inline bool nfs_error_is_fatal(int err)
case -EROFS:
case -ESTALE:
case -E2BIG:
+ case -ENOMEM:
return true;
default:
return false;
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
index 9034b4926909..5088fda9b453 100644
--- a/fs/nfs/io.c
+++ b/fs/nfs/io.c
@@ -25,7 +25,7 @@ static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
/**
* nfs_start_io_read - declare the file is being used for buffered reads
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a buffered read operation is about to start, and ensure
* that we block all direct I/O.
@@ -56,7 +56,7 @@ nfs_start_io_read(struct inode *inode)
/**
* nfs_end_io_read - declare that the buffered read operation is done
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a buffered read operation is done, and release the shared
* lock on inode->i_rwsem.
@@ -69,7 +69,7 @@ nfs_end_io_read(struct inode *inode)
/**
* nfs_start_io_write - declare the file is being used for buffered writes
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a buffered read operation is about to start, and ensure
* that we block all direct I/O.
@@ -83,7 +83,7 @@ nfs_start_io_write(struct inode *inode)
/**
* nfs_end_io_write - declare that the buffered write operation is done
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a buffered write operation is done, and release the
* lock on inode->i_rwsem.
@@ -105,7 +105,7 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
/**
* nfs_end_io_direct - declare the file is being used for direct i/o
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a direct I/O operation is about to start, and ensure
* that we block all buffered I/O.
@@ -136,7 +136,7 @@ nfs_start_io_direct(struct inode *inode)
/**
* nfs_end_io_direct - declare that the direct i/o operation is done
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a direct I/O operation is done, and release the shared
* lock on inode->i_rwsem.
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index e5686be67be8..15f099a24c29 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -221,10 +221,10 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
/**
* nfs_do_submount - set up mountpoint when crossing a filesystem boundary
- * @dentry - parent directory
- * @fh - filehandle for new root dentry
- * @fattr - attributes for new root inode
- * @authflavor - security flavor to use when performing the mount
+ * @dentry: parent directory
+ * @fh: filehandle for new root dentry
+ * @fattr: attributes for new root inode
+ * @authflavor: security flavor to use when performing the mount
*
*/
struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh,
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 350675e3ed47..a7ed29de0a40 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -22,6 +22,7 @@
#include <linux/nfs.h>
#include <linux/nfs2.h>
#include <linux/nfs_fs.h>
+#include "nfstrace.h"
#include "internal.h"
#define NFSDBG_FACILITY NFSDBG_XDR
@@ -55,42 +56,16 @@
#define NFS_attrstat_sz (1+NFS_fattr_sz)
#define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz)
-#define NFS_readlinkres_sz (2)
-#define NFS_readres_sz (1+NFS_fattr_sz+1)
+#define NFS_readlinkres_sz (2+1)
+#define NFS_readres_sz (1+NFS_fattr_sz+1+1)
#define NFS_writeres_sz (NFS_attrstat_sz)
#define NFS_stat_sz (1)
-#define NFS_readdirres_sz (1)
+#define NFS_readdirres_sz (1+1)
#define NFS_statfsres_sz (1+NFS_info_sz)
static int nfs_stat_to_errno(enum nfs_stat);
/*
- * While encoding arguments, set up the reply buffer in advance to
- * receive reply data directly into the page cache.
- */
-static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
- unsigned int base, unsigned int len,
- unsigned int bufsize)
-{
- struct rpc_auth *auth = req->rq_cred->cr_auth;
- unsigned int replen;
-
- replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
- xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
-}
-
-/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("NFS: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
-
-/*
* Encode/decode NFSv2 basic data types
*
* Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
@@ -110,8 +85,8 @@ static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p);
recvd = xdr_read_pages(xdr, count);
if (unlikely(count > recvd))
@@ -125,9 +100,6 @@ out_cheating:
"count %u > recvd %u\n", count, recvd);
count = recvd;
goto out;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -157,13 +129,16 @@ static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
+ if (unlikely(*p != cpu_to_be32(NFS_OK)))
+ goto out_status;
+ *status = 0;
+ return 0;
+out_status:
*status = be32_to_cpup(p);
+ trace_nfs_xdr_status((int)*status);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -205,14 +180,11 @@ static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
__be32 *p;
p = xdr_inline_decode(xdr, NFS2_FHSIZE);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
fh->size = NFS2_FHSIZE;
memcpy(fh->data, p, NFS2_FHSIZE);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -282,8 +254,8 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
fattr->valid |= NFS_ATTR_FATTR_V2;
@@ -325,9 +297,6 @@ out_uid:
out_gid:
dprintk("NFS: returned invalid gid\n");
return -EINVAL;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -416,23 +385,20 @@ static int decode_filename_inline(struct xdr_stream *xdr,
u32 count;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p);
if (count > NFS3_MAXNAMLEN)
goto out_nametoolong;
p = xdr_inline_decode(xdr, count);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
*name = (const char *)p;
*length = count;
return 0;
out_nametoolong:
dprintk("NFS: returned filename too long: %u\n", count);
return -ENAMETOOLONG;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -455,8 +421,8 @@ static int decode_path(struct xdr_stream *xdr)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
length = be32_to_cpup(p);
if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
goto out_size;
@@ -472,9 +438,6 @@ out_cheating:
dprintk("NFS: server cheating in pathname result: "
"length %u > received %u\n", length, recvd);
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -615,8 +578,8 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
const struct nfs_readlinkargs *args = data;
encode_fhandle(xdr, args->fh);
- prepare_reply_buffer(req, args->pages, args->pgbase,
- args->pglen, NFS_readlinkres_sz);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->pglen, NFS_readlinkres_sz);
}
/*
@@ -651,8 +614,8 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
const struct nfs_pgio_args *args = data;
encode_readargs(xdr, args);
- prepare_reply_buffer(req, args->pages, args->pgbase,
- args->count, NFS_readres_sz);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, NFS_readres_sz);
req->rq_rcv_buf.flags |= XDRBUF_READ;
}
@@ -809,8 +772,8 @@ static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
const struct nfs_readdirargs *args = data;
encode_readdirargs(xdr, args);
- prepare_reply_buffer(req, args->pages, 0,
- args->count, NFS_readdirres_sz);
+ rpc_prepare_reply_pages(req, args->pages, 0,
+ args->count, NFS_readdirres_sz);
}
/*
@@ -951,12 +914,12 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
int error;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p++ == xdr_zero) {
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p++ == xdr_zero)
return -EAGAIN;
entry->eof = 1;
@@ -964,8 +927,8 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
}
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
entry->ino = be32_to_cpup(p);
error = decode_filename_inline(xdr, &entry->name, &entry->len);
@@ -978,17 +941,13 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
*/
entry->prev_cookie = entry->cookie;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
entry->cookie = be32_to_cpup(p);
entry->d_type = DT_UNKNOWN;
return 0;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EAGAIN;
}
/*
@@ -1052,17 +1011,14 @@ static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
__be32 *p;
p = xdr_inline_decode(xdr, NFS_info_sz << 2);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
result->tsize = be32_to_cpup(p++);
result->bsize = be32_to_cpup(p++);
result->blocks = be32_to_cpup(p++);
result->bfree = be32_to_cpup(p++);
result->bavail = be32_to_cpup(p);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9fce18548f7e..c5c3fc6e6c60 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -222,8 +222,6 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
switch (status) {
case 0:
status = nfs_refresh_inode(inode, fattr);
- set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
- set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl);
break;
case -EPFNOSUPPORT:
case -EPROTONOSUPPORT:
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 78df4eb60f85..110358f4986d 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -21,6 +21,7 @@
#include <linux/nfs3.h>
#include <linux/nfs_fs.h>
#include <linux/nfsacl.h>
+#include "nfstrace.h"
#include "internal.h"
#define NFSDBG_FACILITY NFSDBG_XDR
@@ -68,13 +69,13 @@
#define NFS3_removeres_sz (NFS3_setattrres_sz)
#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1)
-#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1)
-#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3)
+#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+1)
+#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+1)
#define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4)
#define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
#define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz))
#define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
-#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2)
+#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+1)
#define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13)
#define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12)
#define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6)
@@ -84,7 +85,7 @@
#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \
XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \
- XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
+ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+1)
#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz)
static int nfs3_stat_to_errno(enum nfs_stat);
@@ -104,32 +105,6 @@ static const umode_t nfs_type2fmt[] = {
};
/*
- * While encoding arguments, set up the reply buffer in advance to
- * receive reply data directly into the page cache.
- */
-static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
- unsigned int base, unsigned int len,
- unsigned int bufsize)
-{
- struct rpc_auth *auth = req->rq_cred->cr_auth;
- unsigned int replen;
-
- replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
- xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
-}
-
-/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("NFS: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
-
-/*
* Encode/decode NFSv3 basic data types
*
* Basic NFSv3 data types are defined in section 2.5 of RFC 1813:
@@ -151,13 +126,10 @@ static int decode_uint32(struct xdr_stream *xdr, u32 *value)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
*value = be32_to_cpup(p);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_uint64(struct xdr_stream *xdr, u64 *value)
@@ -165,13 +137,10 @@ static int decode_uint64(struct xdr_stream *xdr, u64 *value)
__be32 *p;
p = xdr_inline_decode(xdr, 8);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
xdr_decode_hyper(p, value);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -211,14 +180,14 @@ static int decode_inline_filename3(struct xdr_stream *xdr,
u32 count;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p);
if (count > NFS3_MAXNAMLEN)
goto out_nametoolong;
p = xdr_inline_decode(xdr, count);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
*name = (const char *)p;
*length = count;
return 0;
@@ -226,9 +195,6 @@ static int decode_inline_filename3(struct xdr_stream *xdr,
out_nametoolong:
dprintk("NFS: returned filename too long: %u\n", count);
return -ENAMETOOLONG;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -249,8 +215,8 @@ static int decode_nfspath3(struct xdr_stream *xdr)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p);
if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
goto out_nametoolong;
@@ -267,9 +233,6 @@ out_cheating:
dprintk("NFS: server cheating in pathname result: "
"count %u > recvd %u\n", count, recvd);
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -303,13 +266,10 @@ static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier)
__be32 *p;
p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
memcpy(verifier, p, NFS3_COOKIEVERFSIZE);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -330,13 +290,10 @@ static int decode_writeverf3(struct xdr_stream *xdr, struct nfs_write_verifier *
__be32 *p;
p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
memcpy(verifier->data, p, NFS3_WRITEVERFSIZE);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -364,13 +321,16 @@ static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
+ if (unlikely(*p != cpu_to_be32(NFS3_OK)))
+ goto out_status;
+ *status = 0;
+ return 0;
+out_status:
*status = be32_to_cpup(p);
+ trace_nfs_xdr_status((int)*status);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -453,23 +413,20 @@ static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
length = be32_to_cpup(p++);
if (unlikely(length > NFS3_FHSIZE))
goto out_toobig;
p = xdr_inline_decode(xdr, length);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
fh->size = length;
memcpy(fh->data, p, length);
return 0;
out_toobig:
dprintk("NFS: file handle size (%u) too big\n", length);
return -E2BIG;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static void zero_nfs_fh3(struct nfs_fh *fh)
@@ -655,8 +612,8 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
p = xdr_decode_ftype3(p, &fmode);
@@ -690,9 +647,6 @@ out_uid:
out_gid:
dprintk("NFS: returned invalid gid\n");
return -EINVAL;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -710,14 +664,11 @@ static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
if (*p != xdr_zero)
return decode_fattr3(xdr, fattr);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -733,8 +684,8 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
fattr->valid |= NFS_ATTR_FATTR_PRESIZE
| NFS_ATTR_FATTR_PRECHANGE
@@ -747,9 +698,6 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -773,14 +721,11 @@ static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
if (*p != xdr_zero)
return decode_wcc_attr(xdr, fattr);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr)
@@ -808,15 +753,12 @@ out:
static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
{
__be32 *p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
if (*p != xdr_zero)
return decode_nfs_fh3(xdr, fh);
zero_nfs_fh3(fh);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -953,8 +895,8 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
const struct nfs3_readlinkargs *args = data;
encode_nfs_fh3(xdr, args->fh);
- prepare_reply_buffer(req, args->pages, args->pgbase,
- args->pglen, NFS3_readlinkres_sz);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->pglen, NFS3_readlinkres_sz);
}
/*
@@ -986,8 +928,8 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
unsigned int replen = args->replen ? args->replen : NFS3_readres_sz;
encode_read3args(xdr, args);
- prepare_reply_buffer(req, args->pages, args->pgbase,
- args->count, replen);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, replen);
req->rq_rcv_buf.flags |= XDRBUF_READ;
}
@@ -1279,7 +1221,7 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
const struct nfs3_readdirargs *args = data;
encode_readdir3args(xdr, args);
- prepare_reply_buffer(req, args->pages, 0,
+ rpc_prepare_reply_pages(req, args->pages, 0,
args->count, NFS3_readdirres_sz);
}
@@ -1321,7 +1263,7 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
const struct nfs3_readdirargs *args = data;
encode_readdirplus3args(xdr, args);
- prepare_reply_buffer(req, args->pages, 0,
+ rpc_prepare_reply_pages(req, args->pages, 0,
args->count, NFS3_readdirres_sz);
}
@@ -1366,7 +1308,7 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
encode_nfs_fh3(xdr, args->fh);
encode_uint32(xdr, args->mask);
if (args->mask & (NFS_ACL | NFS_DFACL)) {
- prepare_reply_buffer(req, args->pages, 0,
+ rpc_prepare_reply_pages(req, args->pages, 0,
NFSACL_MAXPAGES << PAGE_SHIFT,
ACL3_getaclres_sz);
req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
@@ -1643,8 +1585,8 @@ static int decode_read3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 4 + 4 + 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p++);
eof = be32_to_cpup(p++);
ocount = be32_to_cpup(p++);
@@ -1667,9 +1609,6 @@ out_cheating:
count = recvd;
eof = 0;
goto out;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -1690,7 +1629,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
result->op_status = status;
if (status != NFS3_OK)
goto out_status;
- result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2);
+ result->replen = 4 + ((xdr_stream_pos(xdr) - pos) >> 2);
error = decode_read3resok(xdr, result);
out:
return error;
@@ -1731,22 +1670,18 @@ static int decode_write3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 4 + 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
result->count = be32_to_cpup(p++);
result->verf->committed = be32_to_cpup(p++);
if (unlikely(result->verf->committed > NFS_FILE_SYNC))
goto out_badvalue;
if (decode_writeverf3(xdr, &result->verf->verifier))
- goto out_eio;
+ return -EIO;
return result->count;
out_badvalue:
dprintk("NFS: bad stable_how value: %u\n", result->verf->committed);
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
-out_eio:
- return -EIO;
}
static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -2010,12 +1945,12 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
u64 new_cookie;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p == xdr_zero) {
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p == xdr_zero)
return -EAGAIN;
entry->eof = 1;
@@ -2051,8 +1986,8 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
/* In fact, a post_op_fh3: */
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p != xdr_zero) {
error = decode_nfs_fh3(xdr, entry->fh);
if (unlikely(error)) {
@@ -2069,9 +2004,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EAGAIN;
out_truncated:
dprintk("NFS: directory entry contains invalid file handle\n");
*entry = old;
@@ -2183,8 +2115,8 @@ static int decode_fsstat3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 8 * 6 + 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
p = xdr_decode_size3(p, &result->tbytes);
p = xdr_decode_size3(p, &result->fbytes);
p = xdr_decode_size3(p, &result->abytes);
@@ -2193,9 +2125,6 @@ static int decode_fsstat3resok(struct xdr_stream *xdr,
xdr_decode_size3(p, &result->afiles);
/* ignore invarsec */
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
@@ -2255,8 +2184,8 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
result->rtmax = be32_to_cpup(p++);
result->rtpref = be32_to_cpup(p++);
result->rtmult = be32_to_cpup(p++);
@@ -2270,9 +2199,6 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr,
/* ignore properties */
result->lease_time = 0;
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
@@ -2328,15 +2254,12 @@ static int decode_pathconf3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 4 * 6);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
result->max_link = be32_to_cpup(p++);
result->max_namelen = be32_to_cpup(p);
/* ignore remaining fields */
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 19ec38f85ce0..901cca7542f9 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -20,5 +20,8 @@ loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
struct nfs42_layoutstat_data *);
int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t);
+int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
+ const struct nfs42_layout_error *errors,
+ size_t n);
#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index fed06fd9998d..5196bfa7894d 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -329,9 +329,6 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
};
ssize_t err, err2;
- if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY))
- return -EOPNOTSUPP;
-
src_lock = nfs_get_lock_context(nfs_file_open_context(src));
if (IS_ERR(src_lock))
return PTR_ERR(src_lock);
@@ -672,6 +669,170 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
return 0;
}
+static struct nfs42_layouterror_data *
+nfs42_alloc_layouterror_data(struct pnfs_layout_segment *lseg, gfp_t gfp_flags)
+{
+ struct nfs42_layouterror_data *data;
+ struct inode *inode = lseg->pls_layout->plh_inode;
+
+ data = kzalloc(sizeof(*data), gfp_flags);
+ if (data) {
+ data->args.inode = data->inode = nfs_igrab_and_active(inode);
+ if (data->inode) {
+ data->lseg = pnfs_get_lseg(lseg);
+ if (data->lseg)
+ return data;
+ nfs_iput_and_deactive(data->inode);
+ }
+ kfree(data);
+ }
+ return NULL;
+}
+
+static void
+nfs42_free_layouterror_data(struct nfs42_layouterror_data *data)
+{
+ pnfs_put_lseg(data->lseg);
+ nfs_iput_and_deactive(data->inode);
+ kfree(data);
+}
+
+static void
+nfs42_layouterror_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs42_layouterror_data *data = calldata;
+ struct inode *inode = data->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct pnfs_layout_hdr *lo = data->lseg->pls_layout;
+ unsigned i;
+
+ spin_lock(&inode->i_lock);
+ if (!pnfs_layout_is_valid(lo)) {
+ spin_unlock(&inode->i_lock);
+ rpc_exit(task, 0);
+ return;
+ }
+ for (i = 0; i < data->args.num_errors; i++)
+ nfs4_stateid_copy(&data->args.errors[i].stateid,
+ &lo->plh_stateid);
+ spin_unlock(&inode->i_lock);
+ nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+ &data->res.seq_res, task);
+}
+
+static void
+nfs42_layouterror_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs42_layouterror_data *data = calldata;
+ struct inode *inode = data->inode;
+ struct pnfs_layout_hdr *lo = data->lseg->pls_layout;
+
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return;
+
+ switch (task->tk_status) {
+ case 0:
+ break;
+ case -NFS4ERR_BADHANDLE:
+ case -ESTALE:
+ pnfs_destroy_layout(NFS_I(inode));
+ break;
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match(&data->args.errors[0].stateid,
+ &lo->plh_stateid)) {
+ LIST_HEAD(head);
+
+ /*
+ * Mark the bad layout state as invalid, then retry
+ * with the current stateid.
+ */
+ pnfs_mark_layout_stateid_invalid(lo, &head);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&head);
+ nfs_commit_inode(inode, 0);
+ } else
+ spin_unlock(&inode->i_lock);
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match_other(&data->args.errors[0].stateid,
+ &lo->plh_stateid)) {
+ /* Do we need to delay before resending? */
+ if (!nfs4_stateid_is_newer(&lo->plh_stateid,
+ &data->args.errors[0].stateid))
+ rpc_delay(task, HZ);
+ rpc_restart_call_prepare(task);
+ }
+ spin_unlock(&inode->i_lock);
+ break;
+ case -ENOTSUPP:
+ case -EOPNOTSUPP:
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTERROR;
+ }
+}
+
+static void
+nfs42_layouterror_release(void *calldata)
+{
+ struct nfs42_layouterror_data *data = calldata;
+
+ nfs42_free_layouterror_data(data);
+}
+
+static const struct rpc_call_ops nfs42_layouterror_ops = {
+ .rpc_call_prepare = nfs42_layouterror_prepare,
+ .rpc_call_done = nfs42_layouterror_done,
+ .rpc_release = nfs42_layouterror_release,
+};
+
+int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
+ const struct nfs42_layout_error *errors, size_t n)
+{
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ struct nfs42_layouterror_data *data;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTERROR],
+ };
+ struct rpc_task_setup task_setup = {
+ .rpc_message = &msg,
+ .callback_ops = &nfs42_layouterror_ops,
+ .flags = RPC_TASK_ASYNC,
+ };
+ unsigned int i;
+
+ if (!nfs_server_capable(inode, NFS_CAP_LAYOUTERROR))
+ return -EOPNOTSUPP;
+ if (n > NFS42_LAYOUTERROR_MAX)
+ return -EINVAL;
+ data = nfs42_alloc_layouterror_data(lseg, GFP_NOFS);
+ if (!data)
+ return -ENOMEM;
+ for (i = 0; i < n; i++) {
+ data->args.errors[i] = errors[i];
+ data->args.num_errors++;
+ data->res.num_errors++;
+ }
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ task_setup.callback_data = data;
+ task_setup.rpc_client = NFS_SERVER(inode)->client;
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0);
+ task = rpc_run_task(&task_setup);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rpc_put_task(task);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs42_proc_layouterror);
+
static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
struct file *dst_f, struct nfs_lock_context *src_lock,
struct nfs_lock_context *dst_lock, loff_t src_offset,
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 69f72ed2bf87..aed865a84629 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -51,6 +51,15 @@
1 /* opaque devaddr4 length */ + \
XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
#define decode_layoutstats_maxsz (op_decode_hdr_maxsz)
+#define encode_device_error_maxsz (XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+ 1 /* status */ + 1 /* opnum */)
+#define encode_layouterror_maxsz (op_decode_hdr_maxsz + \
+ 2 /* offset */ + \
+ 2 /* length */ + \
+ encode_stateid_maxsz + \
+ 1 /* Array size */ + \
+ encode_device_error_maxsz)
+#define decode_layouterror_maxsz (op_decode_hdr_maxsz)
#define encode_clone_maxsz (encode_stateid_maxsz + \
encode_stateid_maxsz + \
2 /* src offset */ + \
@@ -59,43 +68,53 @@
#define decode_clone_maxsz (op_decode_hdr_maxsz)
#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_allocate_maxsz + \
encode_getattr_maxsz)
#define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_allocate_maxsz + \
decode_getattr_maxsz)
#define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_savefh_maxsz + \
encode_putfh_maxsz + \
encode_copy_maxsz + \
encode_commit_maxsz)
#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_savefh_maxsz + \
decode_putfh_maxsz + \
decode_copy_maxsz + \
decode_commit_maxsz)
#define NFS4_enc_offload_cancel_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_offload_cancel_maxsz)
#define NFS4_dec_offload_cancel_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_offload_cancel_maxsz)
#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_deallocate_maxsz + \
encode_getattr_maxsz)
#define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_deallocate_maxsz + \
decode_getattr_maxsz)
#define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_seek_maxsz)
#define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_seek_maxsz)
#define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \
@@ -106,6 +125,16 @@
decode_sequence_maxsz + \
decode_putfh_maxsz + \
PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
+#define NFS4_enc_layouterror_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ NFS42_LAYOUTERROR_MAX * \
+ encode_layouterror_maxsz)
+#define NFS4_dec_layouterror_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ NFS42_LAYOUTERROR_MAX * \
+ decode_layouterror_maxsz)
#define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -223,6 +252,34 @@ static void encode_clone(struct xdr_stream *xdr,
xdr_encode_hyper(p, args->count);
}
+static void encode_device_error(struct xdr_stream *xdr,
+ const struct nfs42_device_error *error)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 2*4);
+ p = xdr_encode_opaque_fixed(p, error->dev_id.data,
+ NFS4_DEVICEID4_SIZE);
+ *p++ = cpu_to_be32(error->status);
+ *p = cpu_to_be32(error->opnum);
+}
+
+static void encode_layouterror(struct xdr_stream *xdr,
+ const struct nfs42_layout_error *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LAYOUTERROR, decode_layouterror_maxsz, hdr);
+ p = reserve_space(xdr, 8 + 8);
+ p = xdr_encode_hyper(p, args->offset);
+ p = xdr_encode_hyper(p, args->length);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(1);
+ encode_device_error(xdr, &args->errors[0]);
+}
+
/*
* Encode ALLOCATE request
*/
@@ -381,6 +438,27 @@ static void nfs4_xdr_enc_clone(struct rpc_rqst *req,
encode_nops(&hdr);
}
+/*
+ * Encode LAYOUTERROR request
+ */
+static void nfs4_xdr_enc_layouterror(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_layouterror_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+ int i;
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+ for (i = 0; i < args->num_errors; i++)
+ encode_layouterror(xdr, &args->errors[i], &hdr);
+ encode_nops(&hdr);
+}
+
static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
{
return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -394,7 +472,7 @@ static int decode_write_response(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
count = be32_to_cpup(p);
if (count > 1)
return -EREMOTEIO;
@@ -402,18 +480,14 @@ static int decode_write_response(struct xdr_stream *xdr,
status = decode_opaque_fixed(xdr, &res->stateid,
NFS4_STATEID_SIZE);
if (unlikely(status))
- goto out_overflow;
+ return -EIO;
}
p = xdr_inline_decode(xdr, 8 + 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &res->count);
res->verifier.committed = be32_to_cpup(p);
return decode_verifier(xdr, &res->verifier.verifier);
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_copy_requirements(struct xdr_stream *xdr,
@@ -422,14 +496,11 @@ static int decode_copy_requirements(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4 + 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->consecutive = be32_to_cpup(p++);
res->synchronous = be32_to_cpup(p++);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res)
@@ -474,15 +545,11 @@ static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
p = xdr_inline_decode(xdr, 4 + 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->sr_eof = be32_to_cpup(p++);
p = xdr_decode_hyper(p, &res->sr_offset);
return 0;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_layoutstats(struct xdr_stream *xdr)
@@ -495,6 +562,11 @@ static int decode_clone(struct xdr_stream *xdr)
return decode_op_hdr(xdr, OP_CLONE);
}
+static int decode_layouterror(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_LAYOUTERROR);
+}
+
/*
* Decode ALLOCATE request
*/
@@ -704,4 +776,30 @@ out:
return status;
}
+/*
+ * Decode LAYOUTERROR request
+ */
+static int nfs4_xdr_dec_layouterror(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_layouterror_res *res = data;
+ struct compound_hdr hdr;
+ int status, i;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+
+ for (i = 0; i < res->num_errors && status == 0; i++)
+ status = decode_layouterror(xdr);
+out:
+ res->rpc_status = status;
+ return status;
+}
+
#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 2548405da1f7..1339ede979af 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -42,7 +42,7 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
}
#ifdef CONFIG_NFS_V4_1
-/**
+/*
* Per auth flavor data server rpc clients
*/
struct nfs4_ds_server {
@@ -51,7 +51,9 @@ struct nfs4_ds_server {
};
/**
- * Common lookup case for DS I/O
+ * nfs4_find_ds_client - Common lookup case for DS I/O
+ * @ds_clp: pointer to the DS's nfs_client
+ * @flavor: rpc auth flavour to match
*/
static struct nfs4_ds_server *
nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor)
@@ -118,9 +120,13 @@ nfs4_free_ds_server(struct nfs4_ds_server *dss)
}
/**
-* Find or create a DS rpc client with th MDS server rpc client auth flavor
-* in the nfs_client cl_ds_clients list.
-*/
+ * nfs4_find_or_create_ds_client - Find or create a DS rpc client
+ * @ds_clp: pointer to the DS's nfs_client
+ * @inode: pointer to the inode
+ *
+ * Find or create a DS rpc client with th MDS server rpc client auth flavor
+ * in the nfs_client cl_ds_clients list.
+ */
struct rpc_clnt *
nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode)
{
@@ -145,7 +151,6 @@ static void
nfs4_shutdown_ds_clients(struct nfs_client *clp)
{
struct nfs4_ds_server *dss;
- LIST_HEAD(shutdown_list);
while (!list_empty(&clp->cl_ds_clients)) {
dss = list_entry(clp->cl_ds_clients.next,
@@ -284,7 +289,7 @@ static int nfs4_init_callback(struct nfs_client *clp)
/**
* nfs40_init_client - nfs_client initialization tasks for NFSv4.0
- * @clp - nfs_client to initialize
+ * @clp: nfs_client to initialize
*
* Returns zero on success, or a negative errno if some error occurred.
*/
@@ -312,7 +317,7 @@ int nfs40_init_client(struct nfs_client *clp)
/**
* nfs41_init_client - nfs_client initialization tasks for NFSv4.1+
- * @clp - nfs_client to initialize
+ * @clp: nfs_client to initialize
*
* Returns zero on success, or a negative errno if some error occurred.
*/
@@ -360,9 +365,7 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
* nfs4_init_client - Initialise an NFS4 client record
*
* @clp: nfs_client to initialise
- * @timeparms: timeout parameters for underlying RPC transport
- * @ip_addr: callback IP address in presentation format
- * @authflavor: authentication flavor for underlying RPC transport
+ * @cl_init: pointer to nfs_client_initdata
*
* Returns pointer to an NFS client, or an ERR_PTR value.
*/
@@ -649,13 +652,13 @@ nfs4_check_server_scope(struct nfs41_server_scope *s1,
/**
* nfs4_detect_session_trunking - Checks for session trunking.
- *
- * Called after a successful EXCHANGE_ID on a multi-addr connection.
- * Upon success, add the transport.
- *
* @clp: original mount nfs_client
* @res: result structure from an exchange_id using the original mount
* nfs_client with a new multi_addr transport
+ * @xprt: pointer to the transport to add.
+ *
+ * Called after a successful EXCHANGE_ID on a multi-addr connection.
+ * Upon success, add the transport.
*
* Returns zero on success, otherwise -EINVAL
*
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 45b2322e092d..00d17198ee12 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -133,8 +133,10 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t count, unsigned int flags)
{
+ if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY))
+ return -EOPNOTSUPP;
if (file_inode(file_in) == file_inode(file_out))
- return -EINVAL;
+ return -EOPNOTSUPP;
return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
}
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 24f06dcc2b08..2e460c33ae48 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -137,6 +137,7 @@ static size_t nfs_parse_server_name(char *string, size_t len,
/**
* nfs_find_best_sec - Find a security mechanism supported locally
+ * @clnt: pointer to rpc_clnt
* @server: NFS server struct
* @flavors: List of security tuples returned by SECINFO procedure
*
@@ -288,8 +289,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
/**
* nfs_follow_referral - set up mountpoint when hitting a referral on moved error
- * @dentry - parent directory
- * @locations - array of NFSv4 server location information
+ * @dentry: parent directory
+ * @locations: array of NFSv4 server location information
*
*/
static struct vfsmount *nfs_follow_referral(struct dentry *dentry,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 557a5d636183..741ff8c9c6ed 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -730,33 +730,41 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
res->sr_slot = NULL;
}
+static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot,
+ u32 seqnr)
+{
+ if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0)
+ slot->seq_nr_highest_sent = seqnr;
+}
+static void nfs4_slot_sequence_acked(struct nfs4_slot *slot,
+ u32 seqnr)
+{
+ slot->seq_nr_highest_sent = seqnr;
+ slot->seq_nr_last_acked = seqnr;
+}
+
static int nfs41_sequence_process(struct rpc_task *task,
struct nfs4_sequence_res *res)
{
struct nfs4_session *session;
struct nfs4_slot *slot = res->sr_slot;
struct nfs_client *clp;
- bool interrupted = false;
int ret = 1;
if (slot == NULL)
goto out_noaction;
/* don't increment the sequence number if the task wasn't sent */
- if (!RPC_WAS_SENT(task))
+ if (!RPC_WAS_SENT(task) || slot->seq_done)
goto out;
session = slot->table->session;
- if (slot->interrupted) {
- if (res->sr_status != -NFS4ERR_DELAY)
- slot->interrupted = 0;
- interrupted = true;
- }
-
trace_nfs4_sequence_done(session, res);
/* Check the SEQUENCE operation status */
switch (res->sr_status) {
case 0:
+ /* Mark this sequence number as having been acked */
+ nfs4_slot_sequence_acked(slot, slot->seq_nr);
/* Update the slot's sequence and clientid lease timer */
slot->seq_done = 1;
clp = session->clp;
@@ -771,9 +779,9 @@ static int nfs41_sequence_process(struct rpc_task *task,
* sr_status remains 1 if an RPC level error occurred.
* The server may or may not have processed the sequence
* operation..
- * Mark the slot as having hosted an interrupted RPC call.
*/
- slot->interrupted = 1;
+ nfs4_slot_sequence_record_sent(slot, slot->seq_nr);
+ slot->seq_done = 1;
goto out;
case -NFS4ERR_DELAY:
/* The server detected a resend of the RPC call and
@@ -784,6 +792,7 @@ static int nfs41_sequence_process(struct rpc_task *task,
__func__,
slot->slot_nr,
slot->seq_nr);
+ nfs4_slot_sequence_acked(slot, slot->seq_nr);
goto out_retry;
case -NFS4ERR_RETRY_UNCACHED_REP:
case -NFS4ERR_SEQ_FALSE_RETRY:
@@ -791,6 +800,7 @@ static int nfs41_sequence_process(struct rpc_task *task,
* The server thinks we tried to replay a request.
* Retry the call after bumping the sequence ID.
*/
+ nfs4_slot_sequence_acked(slot, slot->seq_nr);
goto retry_new_seq;
case -NFS4ERR_BADSLOT:
/*
@@ -801,21 +811,28 @@ static int nfs41_sequence_process(struct rpc_task *task,
goto session_recover;
goto retry_nowait;
case -NFS4ERR_SEQ_MISORDERED:
+ nfs4_slot_sequence_record_sent(slot, slot->seq_nr);
/*
- * Was the last operation on this sequence interrupted?
- * If so, retry after bumping the sequence number.
- */
- if (interrupted)
- goto retry_new_seq;
- /*
- * Could this slot have been previously retired?
- * If so, then the server may be expecting seq_nr = 1!
+ * Were one or more calls using this slot interrupted?
+ * If the server never received the request, then our
+ * transmitted slot sequence number may be too high.
*/
- if (slot->seq_nr != 1) {
- slot->seq_nr = 1;
+ if ((s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1) {
+ slot->seq_nr--;
goto retry_nowait;
}
- goto session_recover;
+ /*
+ * RFC5661:
+ * A retry might be sent while the original request is
+ * still in progress on the replier. The replier SHOULD
+ * deal with the issue by returning NFS4ERR_DELAY as the
+ * reply to SEQUENCE or CB_SEQUENCE operation, but
+ * implementations MAY return NFS4ERR_SEQ_MISORDERED.
+ *
+ * Restart the search after a delay.
+ */
+ slot->seq_nr = slot->seq_nr_highest_sent;
+ goto out_retry;
default:
/* Just update the slot sequence no. */
slot->seq_done = 1;
@@ -906,17 +923,6 @@ static const struct rpc_call_ops nfs41_call_sync_ops = {
.rpc_call_done = nfs41_call_sync_done,
};
-static void
-nfs4_sequence_process_interrupted(struct nfs_client *client,
- struct nfs4_slot *slot, const struct cred *cred)
-{
- struct rpc_task *task;
-
- task = _nfs41_proc_sequence(client, cred, slot, true);
- if (!IS_ERR(task))
- rpc_put_task_async(task);
-}
-
#else /* !CONFIG_NFS_V4_1 */
static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
@@ -937,16 +943,15 @@ int nfs4_sequence_done(struct rpc_task *task,
}
EXPORT_SYMBOL_GPL(nfs4_sequence_done);
-static void
-nfs4_sequence_process_interrupted(struct nfs_client *client,
- struct nfs4_slot *slot, const struct cred *cred)
+#endif /* !CONFIG_NFS_V4_1 */
+
+static void nfs41_sequence_res_init(struct nfs4_sequence_res *res)
{
- WARN_ON_ONCE(1);
- slot->interrupted = 0;
+ res->sr_timestamp = jiffies;
+ res->sr_status_flags = 0;
+ res->sr_status = 1;
}
-#endif /* !CONFIG_NFS_V4_1 */
-
static
void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res,
@@ -958,10 +963,6 @@ void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args,
args->sa_slot = slot;
res->sr_slot = slot;
- res->sr_timestamp = jiffies;
- res->sr_status_flags = 0;
- res->sr_status = 1;
-
}
int nfs4_setup_sequence(struct nfs_client *client,
@@ -982,31 +983,25 @@ int nfs4_setup_sequence(struct nfs_client *client,
task->tk_timeout = 0;
}
- for (;;) {
- spin_lock(&tbl->slot_tbl_lock);
- /* The state manager will wait until the slot table is empty */
- if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
- goto out_sleep;
-
- slot = nfs4_alloc_slot(tbl);
- if (IS_ERR(slot)) {
- /* Try again in 1/4 second */
- if (slot == ERR_PTR(-ENOMEM))
- task->tk_timeout = HZ >> 2;
- goto out_sleep;
- }
- spin_unlock(&tbl->slot_tbl_lock);
+ spin_lock(&tbl->slot_tbl_lock);
+ /* The state manager will wait until the slot table is empty */
+ if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
+ goto out_sleep;
- if (likely(!slot->interrupted))
- break;
- nfs4_sequence_process_interrupted(client,
- slot, task->tk_msg.rpc_cred);
+ slot = nfs4_alloc_slot(tbl);
+ if (IS_ERR(slot)) {
+ /* Try again in 1/4 second */
+ if (slot == ERR_PTR(-ENOMEM))
+ task->tk_timeout = HZ >> 2;
+ goto out_sleep;
}
+ spin_unlock(&tbl->slot_tbl_lock);
nfs4_sequence_attach_slot(args, res, slot);
trace_nfs4_setup_sequence(session, args);
out_start:
+ nfs41_sequence_res_init(res);
rpc_call_start(task);
return 0;
@@ -1555,6 +1550,10 @@ static void nfs_clear_open_stateid(struct nfs4_state *state,
static void nfs_set_open_stateid_locked(struct nfs4_state *state,
const nfs4_stateid *stateid, nfs4_stateid *freeme)
+ __must_hold(&state->owner->so_lock)
+ __must_hold(&state->seqlock)
+ __must_hold(RCU)
+
{
DEFINE_WAIT(wait);
int status = 0;
@@ -2934,7 +2933,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
}
out:
- nfs4_sequence_free_slot(&opendata->o_res.seq_res);
+ if (!opendata->cancelled)
+ nfs4_sequence_free_slot(&opendata->o_res.seq_res);
return ret;
}
@@ -5963,7 +5963,7 @@ out:
/**
* nfs4_proc_setclientid_confirm - Confirm client ID
* @clp: state data structure
- * @res: result of a previous SETCLIENTID
+ * @arg: result of a previous SETCLIENTID
* @cred: credential to use for this call
*
* Returns zero, a negative errno, or a negative NFS4ERR status code.
@@ -6302,7 +6302,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
p->arg.seqid = seqid;
p->res.seqid = seqid;
p->lsp = lsp;
- refcount_inc(&lsp->ls_count);
/* Ensure we don't close file until we're done freeing locks! */
p->ctx = get_nfs_open_context(ctx);
p->l_ctx = nfs_get_lock_context(ctx);
@@ -6527,7 +6526,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
p->res.lock_seqid = p->arg.lock_seqid;
p->lsp = lsp;
p->server = server;
- refcount_inc(&lsp->ls_count);
p->ctx = get_nfs_open_context(ctx);
locks_init_lock(&p->fl);
locks_copy_lock(&p->fl, fl);
@@ -7527,7 +7525,7 @@ int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred)
return status;
}
-/**
+/*
* If 'use_integrity' is true and the state managment nfs_client
* cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
* and the machine credential as per RFC3530bis and RFC5661 Security
@@ -8937,10 +8935,12 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
if (status != 0)
goto out;
- /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
- if (task->tk_status < 0 || lgp->res.layoutp->len == 0) {
+ if (task->tk_status < 0) {
status = nfs4_layoutget_handle_exception(task, lgp, &exception);
*timeout = exception.timeout;
+ } else if (lgp->res.layoutp->len == 0) {
+ status = -EAGAIN;
+ *timeout = nfs4_update_delay(&exception.timeout);
} else
lseg = pnfs_layout_process(lgp);
out:
@@ -9219,7 +9219,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
return status;
}
-/**
+/*
* Use the state managment nfs_client cl_rpcclient, which uses krb5i (if
* possible) as per RFC3530bis and RFC5661 Security Considerations sections
*/
@@ -9484,7 +9484,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = {
* @server: server / transport on which to perform the operation
* @stateid: state ID to release
* @cred: credential
- * @is_recovery: set to true if this call needs to be privileged
+ * @privileged: set to true if this call needs to be privileged
*
* Note: this function is always asynchronous.
*/
@@ -9691,7 +9691,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_DEALLOCATE
| NFS_CAP_SEEK
| NFS_CAP_LAYOUTSTATS
- | NFS_CAP_CLONE,
+ | NFS_CAP_CLONE
+ | NFS_CAP_LAYOUTERROR,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index a5489d70a724..bcb532def9e2 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -55,7 +55,7 @@ static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize)
/**
* nfs4_slot_tbl_drain_complete - wake waiters when drain is complete
- * @tbl - controlling slot table
+ * @tbl: controlling slot table
*
*/
void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
@@ -110,6 +110,8 @@ static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl,
slot->table = tbl;
slot->slot_nr = slotid;
slot->seq_nr = seq_init;
+ slot->seq_nr_highest_sent = seq_init;
+ slot->seq_nr_last_acked = seq_init - 1;
}
return slot;
}
@@ -276,7 +278,8 @@ static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl,
p = &tbl->slots;
while (*p) {
(*p)->seq_nr = ivalue;
- (*p)->interrupted = 0;
+ (*p)->seq_nr_highest_sent = ivalue;
+ (*p)->seq_nr_last_acked = ivalue - 1;
p = &(*p)->next;
}
tbl->highest_used_slotid = NFS4_NO_SLOT;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 3c550f297561..b996ee23f1ba 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -10,7 +10,7 @@
/* maximum number of slots to use */
#define NFS4_DEF_SLOT_TABLE_SIZE (64U)
-#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U)
+#define NFS4_DEF_CB_SLOT_TABLE_SIZE (16U)
#define NFS4_MAX_SLOT_TABLE (1024U)
#define NFS4_NO_SLOT ((u32)-1)
@@ -23,8 +23,9 @@ struct nfs4_slot {
unsigned long generation;
u32 slot_nr;
u32 seq_nr;
- unsigned int interrupted : 1,
- privileged : 1,
+ u32 seq_nr_last_acked;
+ u32 seq_nr_highest_sent;
+ unsigned int privileged : 1,
seq_done : 1;
};
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 02488b50534a..3de36479ed7a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -563,6 +563,7 @@ static void nfs4_gc_state_owners(struct nfs_server *server)
* nfs4_get_state_owner - Look up a state owner given a credential
* @server: nfs_server to search
* @cred: RPC credential to match
+ * @gfp_flags: allocation mode
*
* Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
*/
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index b4557cf685fb..cd1a5c08da9a 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -524,6 +524,31 @@ TRACE_EVENT(nfs4_setup_sequence,
)
);
+TRACE_EVENT(nfs4_xdr_status,
+ TP_PROTO(
+ u32 op,
+ int error
+ ),
+
+ TP_ARGS(op, error),
+
+ TP_STRUCT__entry(
+ __field(u32, op)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ __entry->op = op;
+ __entry->error = -error;
+ ),
+
+ TP_printk(
+ "operation %d: nfs status %d (%s)",
+ __entry->op,
+ __entry->error, show_nfsv4_errors(__entry->error)
+ )
+);
+
DECLARE_EVENT_CLASS(nfs4_open_event,
TP_PROTO(
const struct nfs_open_context *ctx,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 2fc8f6fa25e4..602446158bfb 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -54,6 +54,7 @@
#include <linux/nfs_fs.h>
#include "nfs4_fs.h"
+#include "nfs4trace.h"
#include "internal.h"
#include "nfs4idmap.h"
#include "nfs4session.h"
@@ -214,14 +215,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
nfs4_fattr_bitmap_maxsz)
#define encode_read_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 3)
-#define decode_read_maxsz (op_decode_hdr_maxsz + 2)
+#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + 1)
#define encode_readdir_maxsz (op_encode_hdr_maxsz + \
2 + encode_verifier_maxsz + 5 + \
nfs4_label_maxsz)
#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
- decode_verifier_maxsz)
+ decode_verifier_maxsz + 1)
#define encode_readlink_maxsz (op_encode_hdr_maxsz)
-#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
+#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + 1)
#define encode_write_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 4)
#define decode_write_maxsz (op_decode_hdr_maxsz + \
@@ -283,14 +284,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
#define decode_delegreturn_maxsz (op_decode_hdr_maxsz)
#define encode_getacl_maxsz (encode_getattr_maxsz)
#define decode_getacl_maxsz (op_decode_hdr_maxsz + \
- nfs4_fattr_bitmap_maxsz + 1)
+ nfs4_fattr_bitmap_maxsz + 1 + 1)
#define encode_setacl_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 3)
#define decode_setacl_maxsz (decode_setattr_maxsz)
#define encode_fs_locations_maxsz \
(encode_getattr_maxsz)
#define decode_fs_locations_maxsz \
- (0)
+ (1)
#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz)
#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4))
@@ -391,12 +392,13 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
1 /* opaque devaddr4 length */ + \
/* devaddr4 payload is read into page */ \
1 /* notification bitmap length */ + \
- 1 /* notification bitmap, word 0 */)
+ 1 /* notification bitmap, word 0 */ + \
+ 1 /* possible XDR padding */)
#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
encode_stateid_maxsz)
#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
decode_stateid_maxsz + \
- XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
+ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + 1)
#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \
2 /* offset */ + \
2 /* length */ + \
@@ -1015,12 +1017,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
struct compound_hdr *hdr)
{
__be32 *p;
- struct rpc_auth *auth = req->rq_cred->cr_auth;
/* initialize running count of expected bytes in reply.
* NOTE: the replied tag SHOULD be the same is the one sent,
* but this is not required as a MUST for the server to do so. */
- hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
+ hdr->replen = 3 + hdr->taglen;
WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN);
encode_string(xdr, hdr->taglen, hdr->tag);
@@ -2340,9 +2341,9 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
if (args->lg_args) {
encode_layoutget(xdr, args->lg_args, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
- args->lg_args->layout.pages,
- 0, args->lg_args->layout.pglen);
+ rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
+ args->lg_args->layout.pglen,
+ hdr.replen);
}
encode_nops(&hdr);
}
@@ -2386,9 +2387,9 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
if (args->lg_args) {
encode_layoutget(xdr, args->lg_args, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
- args->lg_args->layout.pages,
- 0, args->lg_args->layout.pglen);
+ rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
+ args->lg_args->layout.pglen,
+ hdr.replen);
}
encode_nops(&hdr);
}
@@ -2498,8 +2499,8 @@ static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_putfh(xdr, args->fh, &hdr);
encode_readlink(xdr, args, req, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
- args->pgbase, args->pglen);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->pglen, hdr.replen);
encode_nops(&hdr);
}
@@ -2519,11 +2520,8 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_putfh(xdr, args->fh, &hdr);
encode_readdir(xdr, args, req, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
- args->pgbase, args->count);
- dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
- __func__, hdr.replen << 2, args->pages,
- args->pgbase, args->count);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, hdr.replen);
encode_nops(&hdr);
}
@@ -2543,8 +2541,8 @@ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_putfh(xdr, args->fh, &hdr);
encode_read(xdr, args, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
- args->pages, args->pgbase, args->count);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, hdr.replen);
req->rq_rcv_buf.flags |= XDRBUF_READ;
encode_nops(&hdr);
}
@@ -2590,9 +2588,8 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_getattr(xdr, nfs4_acl_bitmap, NULL,
ARRAY_SIZE(nfs4_acl_bitmap), &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
- args->acl_pages, 0, args->acl_len);
-
+ rpc_prepare_reply_pages(req, args->acl_pages, 0,
+ args->acl_len, replen + 1);
encode_nops(&hdr);
}
@@ -2813,9 +2810,8 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
encode_fs_locations(xdr, args->bitmask, &hdr);
}
- /* Set up reply kvec to capture returned fs_locations array. */
- xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
- (struct page **)&args->page, 0, PAGE_SIZE);
+ rpc_prepare_reply_pages(req, (struct page **)&args->page, 0,
+ PAGE_SIZE, replen + 1);
encode_nops(&hdr);
}
@@ -3017,10 +3013,8 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
/* set up reply kvec. Subtract notification bitmap max size (2)
* so that notification bitmap is put in xdr_buf tail */
- xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
- args->pdev->pages, args->pdev->pgbase,
- args->pdev->pglen);
-
+ rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase,
+ args->pdev->pglen, hdr.replen - 2);
encode_nops(&hdr);
}
@@ -3041,9 +3035,8 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
encode_putfh(xdr, NFS_FH(args->inode), &hdr);
encode_layoutget(xdr, args, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
- args->layout.pages, 0, args->layout.pglen);
-
+ rpc_prepare_reply_pages(req, args->layout.pages, 0,
+ args->layout.pglen, hdr.replen);
encode_nops(&hdr);
}
@@ -3144,22 +3137,12 @@ static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req,
}
#endif /* CONFIG_NFS_V4_1 */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("nfs: %s: prematurely hit end of receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
{
ssize_t ret = xdr_stream_decode_opaque_inline(xdr, (void **)string,
NFS4_OPAQUE_LIMIT);
- if (unlikely(ret < 0)) {
- if (ret == -EBADMSG)
- print_overflow_msg(__func__, xdr);
+ if (unlikely(ret < 0))
return -EIO;
- }
*len = ret;
return 0;
}
@@ -3170,22 +3153,19 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
hdr->status = be32_to_cpup(p++);
hdr->taglen = be32_to_cpup(p);
p = xdr_inline_decode(xdr, hdr->taglen + 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
hdr->tag = (char *)p;
p += XDR_QUADLEN(hdr->taglen);
hdr->nops = be32_to_cpup(p);
if (unlikely(hdr->nops < 1))
return nfs4_stat_to_errno(hdr->status);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected,
@@ -3201,11 +3181,14 @@ static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected,
opnum = be32_to_cpup(p++);
if (unlikely(opnum != expected))
goto out_bad_operation;
+ if (unlikely(*p != cpu_to_be32(NFS_OK)))
+ goto out_status;
+ *nfs_retval = 0;
+ return true;
+out_status:
nfserr = be32_to_cpup(p);
- if (nfserr == NFS_OK)
- *nfs_retval = 0;
- else
- *nfs_retval = nfs4_stat_to_errno(nfserr);
+ trace_nfs4_xdr_status(opnum, nfserr);
+ *nfs_retval = nfs4_stat_to_errno(nfserr);
return true;
out_bad_operation:
dprintk("nfs: Server returned operation"
@@ -3214,7 +3197,6 @@ out_bad_operation:
*nfs_retval = -EREMOTEIO;
return false;
out_overflow:
- print_overflow_msg(__func__, xdr);
*nfs_retval = -EIO;
return false;
}
@@ -3235,10 +3217,9 @@ static int decode_ace(struct xdr_stream *xdr, void *ace)
char *str;
p = xdr_inline_decode(xdr, 12);
- if (likely(p))
- return decode_opaque_inline(xdr, &strlen, &str);
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (unlikely(!p))
+ return -EIO;
+ return decode_opaque_inline(xdr, &strlen, &str);
}
static ssize_t
@@ -3249,10 +3230,9 @@ decode_bitmap4(struct xdr_stream *xdr, uint32_t *bitmap, size_t sz)
ret = xdr_stream_decode_uint32_array(xdr, bitmap, sz);
if (likely(ret >= 0))
return ret;
- if (ret == -EMSGSIZE)
- return sz;
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (ret != -EMSGSIZE)
+ return -EIO;
+ return sz;
}
static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
@@ -3268,13 +3248,10 @@ static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigne
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*attrlen = be32_to_cpup(p);
*savep = xdr_stream_pos(xdr);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
@@ -3303,7 +3280,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*type = be32_to_cpup(p);
if (*type < NF4REG || *type > NF4NAMEDATTR) {
dprintk("%s: bad type %d\n", __func__, *type);
@@ -3314,9 +3291,6 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
}
dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_fh_expire_type(struct xdr_stream *xdr,
@@ -3330,15 +3304,12 @@ static int decode_attr_fh_expire_type(struct xdr_stream *xdr,
if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*type = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE;
}
dprintk("%s: expire type=0x%x\n", __func__, *type);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
@@ -3352,7 +3323,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, change);
bitmap[0] &= ~FATTR4_WORD0_CHANGE;
ret = NFS_ATTR_FATTR_CHANGE;
@@ -3360,9 +3331,6 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
dprintk("%s: change attribute=%Lu\n", __func__,
(unsigned long long)*change);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
@@ -3376,16 +3344,13 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, size);
bitmap[0] &= ~FATTR4_WORD0_SIZE;
ret = NFS_ATTR_FATTR_SIZE;
}
dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3398,15 +3363,12 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
}
dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3419,15 +3381,12 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
}
dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
@@ -3442,7 +3401,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
p = xdr_inline_decode(xdr, 16);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &fsid->major);
xdr_decode_hyper(p, &fsid->minor);
bitmap[0] &= ~FATTR4_WORD0_FSID;
@@ -3452,9 +3411,6 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
(unsigned long long)fsid->major,
(unsigned long long)fsid->minor);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3467,15 +3423,12 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
}
dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res)
@@ -3487,14 +3440,11 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *
if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
*res = -be32_to_cpup(p);
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
@@ -3526,13 +3476,13 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru
if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
if (len > NFS4_FHSIZE)
return -EIO;
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
if (fh != NULL) {
memcpy(fh->data, p, len);
fh->size = len;
@@ -3540,9 +3490,6 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru
bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3555,15 +3502,12 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
}
dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -3577,16 +3521,13 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, fileid);
bitmap[0] &= ~FATTR4_WORD0_FILEID;
ret = NFS_ATTR_FATTR_FILEID;
}
dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -3600,16 +3541,13 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, fileid);
bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID;
}
dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3623,15 +3561,12 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
}
dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3645,15 +3580,12 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
}
dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3667,15 +3599,12 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
}
dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
@@ -3686,7 +3615,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
n = be32_to_cpup(p);
if (n == 0)
goto root_path;
@@ -3718,9 +3647,6 @@ out_eio:
dprintk(" status %d", status);
status = -EIO;
goto out;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
@@ -3745,7 +3671,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
goto out;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ goto out_eio;
n = be32_to_cpup(p);
if (n <= 0)
goto out_eio;
@@ -3758,7 +3684,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
loc = &res->locations[res->nlocations];
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ goto out_eio;
m = be32_to_cpup(p);
dprintk("%s: servers:\n", __func__);
@@ -3796,8 +3722,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
out:
dprintk("%s: fs_locations done, error = %d\n", __func__, status);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
out_eio:
status = -EIO;
goto out;
@@ -3814,15 +3738,12 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
}
dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
@@ -3836,15 +3757,12 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*maxlink = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
}
dprintk("%s: maxlink=%u\n", __func__, *maxlink);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
@@ -3858,15 +3776,12 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*maxname = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
}
dprintk("%s: maxname=%u\n", __func__, *maxname);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3881,7 +3796,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
uint64_t maxread;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, &maxread);
if (maxread > 0x7FFFFFFF)
maxread = 0x7FFFFFFF;
@@ -3890,9 +3805,6 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
}
dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3907,7 +3819,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
uint64_t maxwrite;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, &maxwrite);
if (maxwrite > 0x7FFFFFFF)
maxwrite = 0x7FFFFFFF;
@@ -3916,9 +3828,6 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
}
dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
@@ -3933,7 +3842,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
tmp = be32_to_cpup(p);
*mode = tmp & ~S_IFMT;
bitmap[1] &= ~FATTR4_WORD1_MODE;
@@ -3941,9 +3850,6 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
}
dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
@@ -3957,16 +3863,13 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*nlink = be32_to_cpup(p);
bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
ret = NFS_ATTR_FATTR_NLINK;
}
dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static ssize_t decode_nfs4_string(struct xdr_stream *xdr,
@@ -4011,10 +3914,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
return NFS_ATTR_FATTR_OWNER;
}
out:
- if (len != -EBADMSG)
- return 0;
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (len == -EBADMSG)
+ return -EIO;
+ return 0;
}
static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
@@ -4046,10 +3948,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
return NFS_ATTR_FATTR_GROUP;
}
out:
- if (len != -EBADMSG)
- return 0;
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (len == -EBADMSG)
+ return -EIO;
+ return 0;
}
static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
@@ -4066,7 +3967,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
major = be32_to_cpup(p++);
minor = be32_to_cpup(p);
tmp = MKDEV(major, minor);
@@ -4077,9 +3978,6 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
}
dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -4093,15 +3991,12 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
}
dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -4115,15 +4010,12 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
}
dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -4137,15 +4029,12 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
}
dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
@@ -4159,7 +4048,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, used);
bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
ret = NFS_ATTR_FATTR_SPACE_USED;
@@ -4167,9 +4056,6 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
dprintk("%s: space used=%Lu\n", __func__,
(unsigned long long)*used);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static __be32 *
@@ -4189,12 +4075,9 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
p = xdr_inline_decode(xdr, nfstime4_maxsz << 2);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_nfstime4(p, time);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -4265,19 +4148,19 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
lfs = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
pi = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
if (len < NFS4_MAXLABELLEN) {
if (label) {
memcpy(label->label, p, len);
@@ -4295,10 +4178,6 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
(char *)label->label, label->len, label->pi, label->lfs);
return status;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -4342,14 +4221,11 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
p = xdr_inline_decode(xdr, 20);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
cinfo->atomic = be32_to_cpup(p++);
p = xdr_decode_hyper(p, &cinfo->before);
xdr_decode_hyper(p, &cinfo->after);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)
@@ -4363,24 +4239,19 @@ static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)
return status;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
supp = be32_to_cpup(p++);
acc = be32_to_cpup(p);
*supported = supp;
*access = acc;
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
{
ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len);
- if (unlikely(ret < 0)) {
- print_overflow_msg(__func__, xdr);
+ if (unlikely(ret < 0))
return -EIO;
- }
return 0;
}
@@ -4460,13 +4331,11 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
return status;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
bmlen = be32_to_cpup(p);
p = xdr_inline_decode(xdr, bmlen << 2);
if (likely(p))
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -4574,13 +4443,10 @@ static int decode_threshold_hint(struct xdr_stream *xdr,
if (likely(bitmap[0] & hint_bit)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_first_threshold_item4(struct xdr_stream *xdr,
@@ -4593,10 +4459,8 @@ static int decode_first_threshold_item4(struct xdr_stream *xdr,
/* layout type */
p = xdr_inline_decode(xdr, 4);
- if (unlikely(!p)) {
- print_overflow_msg(__func__, xdr);
+ if (unlikely(!p))
return -EIO;
- }
res->l_type = be32_to_cpup(p);
/* thi_hintset bitmap */
@@ -4654,7 +4518,7 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr,
return -EREMOTEIO;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
num = be32_to_cpup(p);
if (num == 0)
return 0;
@@ -4667,9 +4531,6 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr,
bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD;
}
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
@@ -4857,7 +4718,7 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
fsinfo->nlayouttypes = be32_to_cpup(p);
/* pNFS is not supported by the underlying file system */
@@ -4867,7 +4728,7 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr,
/* Decode and set first layout type, move xdr->p past unused types */
p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
/* If we get too many, then just cap it at the max */
if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) {
@@ -4879,9 +4740,6 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr,
for(i = 0; i < fsinfo->nlayouttypes; ++i)
fsinfo->layouttype[i] = be32_to_cpup(p++);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -4915,10 +4773,8 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
*res = 0;
if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
p = xdr_inline_decode(xdr, 4);
- if (unlikely(!p)) {
- print_overflow_msg(__func__, xdr);
+ if (unlikely(!p))
return -EIO;
- }
*res = be32_to_cpup(p);
bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
}
@@ -4937,10 +4793,8 @@ static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
*res = 0;
if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) {
p = xdr_inline_decode(xdr, 4);
- if (unlikely(!p)) {
- print_overflow_msg(__func__, xdr);
+ if (unlikely(!p))
return -EIO;
- }
*res = be32_to_cpup(p);
bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE;
}
@@ -5016,19 +4870,16 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
if (len > NFS4_FHSIZE)
return -EIO;
fh->size = len;
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
memcpy(fh->data, p, len);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -5052,7 +4903,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
p = xdr_decode_hyper(p, &length);
type = be32_to_cpup(p++); /* 4 byte read */
@@ -5069,11 +4920,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */
p = xdr_inline_decode(xdr, namelen); /* variable size field */
- if (likely(p))
- return -NFS4ERR_DENIED;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (likely(!p))
+ return -EIO;
+ return -NFS4ERR_DENIED;
}
static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
@@ -5142,7 +4991,7 @@ static int decode_space_limit(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
limit_type = be32_to_cpup(p++);
switch (limit_type) {
case NFS4_LIMIT_SIZE:
@@ -5156,9 +5005,6 @@ static int decode_space_limit(struct xdr_stream *xdr,
maxsize >>= PAGE_SHIFT;
*pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_rw_delegation(struct xdr_stream *xdr,
@@ -5173,7 +5019,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
return status;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->do_recall = be32_to_cpup(p);
switch (delegation_type) {
@@ -5186,9 +5032,6 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
return -EIO;
}
return decode_ace(xdr, NULL);
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -5198,7 +5041,7 @@ static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
why_no_delegation = be32_to_cpup(p);
switch (why_no_delegation) {
case WND4_CONTENTION:
@@ -5207,9 +5050,6 @@ static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
/* Ignore for now */
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -5219,7 +5059,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
delegation_type = be32_to_cpup(p);
res->delegation_type = 0;
switch (delegation_type) {
@@ -5232,9 +5072,6 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
return decode_no_delegation(xdr, res);
}
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -5256,7 +5093,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->rflags = be32_to_cpup(p++);
bmlen = be32_to_cpup(p);
if (bmlen > 10)
@@ -5264,7 +5101,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
p = xdr_inline_decode(xdr, bmlen << 2);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
for (i = 0; i < savewords; ++i)
res->attrset[i] = be32_to_cpup(p++);
@@ -5275,9 +5112,6 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
xdr_error:
dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
@@ -5326,7 +5160,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req,
return status;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
eof = be32_to_cpup(p++);
count = be32_to_cpup(p);
recvd = xdr_read_pages(xdr, count);
@@ -5339,9 +5173,6 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req,
res->eof = eof;
res->count = count;
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
@@ -5374,7 +5205,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
/* Convert length of symlink */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
if (len >= rcvbuf->page_len || len <= 0) {
dprintk("nfs: server returned giant symlink!\n");
@@ -5395,9 +5226,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
*/
xdr_terminate_string(rcvbuf, len);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -5500,7 +5328,6 @@ static int decode_setattr(struct xdr_stream *xdr)
return status;
if (decode_bitmap4(xdr, NULL, 0) >= 0)
return 0;
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -5512,7 +5339,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
opnum = be32_to_cpup(p++);
if (opnum != OP_SETCLIENTID) {
dprintk("nfs: decode_setclientid: Server returned operation"
@@ -5523,7 +5350,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re
if (nfserr == NFS_OK) {
p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &res->clientid);
memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
} else if (nfserr == NFSERR_CLID_INUSE) {
@@ -5532,28 +5359,25 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re
/* skip netid string */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
/* skip uaddr string */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
return -NFSERR_CLID_INUSE;
} else
return nfs4_stat_to_errno(nfserr);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_setclientid_confirm(struct xdr_stream *xdr)
@@ -5572,13 +5396,10 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res)
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->count = be32_to_cpup(p++);
res->verf->committed = be32_to_cpup(p++);
return decode_write_verifier(xdr, &res->verf->verifier);
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_delegreturn(struct xdr_stream *xdr)
@@ -5594,30 +5415,24 @@ static int decode_secinfo_gss(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
oid_len = be32_to_cpup(p);
if (oid_len > GSS_OID_MAX_LEN)
- goto out_err;
+ return -EINVAL;
p = xdr_inline_decode(xdr, oid_len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
memcpy(flavor->flavor_info.oid.data, p, oid_len);
flavor->flavor_info.oid.len = oid_len;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
flavor->flavor_info.qop = be32_to_cpup(p++);
flavor->flavor_info.service = be32_to_cpup(p);
return 0;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
-out_err:
- return -EINVAL;
}
static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
@@ -5629,7 +5444,7 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->flavors->num_flavors = 0;
num_flavors = be32_to_cpup(p);
@@ -5641,7 +5456,7 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
sec_flavor->flavor = be32_to_cpup(p);
if (sec_flavor->flavor == RPC_AUTH_GSS) {
@@ -5655,9 +5470,6 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res
status = 0;
out:
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
@@ -5711,11 +5523,11 @@ static int decode_exchange_id(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, &res->clientid);
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->seqid = be32_to_cpup(p++);
res->flags = be32_to_cpup(p++);
@@ -5739,7 +5551,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
/* server_owner4.so_minor_id */
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &res->server_owner->minor_id);
/* server_owner4.so_major_id */
@@ -5759,7 +5571,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
/* Implementation Id */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
impl_id_count = be32_to_cpup(p++);
if (impl_id_count) {
@@ -5778,16 +5590,13 @@ static int decode_exchange_id(struct xdr_stream *xdr,
/* nii_date */
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &res->impl_id->date.seconds);
res->impl_id->date.nseconds = be32_to_cpup(p);
/* if there's more than one entry, ignore the rest */
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_chan_attrs(struct xdr_stream *xdr,
@@ -5798,7 +5607,7 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 28);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
val = be32_to_cpup(p++); /* headerpadsz */
if (val)
return -EINVAL; /* no support for header padding yet */
@@ -5816,12 +5625,9 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
if (nr_attrs == 1) {
p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
@@ -5844,7 +5650,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr,
/* dir flags, rdma mode bool */
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->dir = be32_to_cpup(p++);
if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH)
@@ -5855,9 +5661,6 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr,
res->use_conn_in_rdma_mode = true;
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_create_session(struct xdr_stream *xdr,
@@ -5875,7 +5678,7 @@ static int decode_create_session(struct xdr_stream *xdr,
/* seqid, flags */
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->seqid = be32_to_cpup(p++);
res->flags = be32_to_cpup(p);
@@ -5884,9 +5687,6 @@ static int decode_create_session(struct xdr_stream *xdr,
if (!status)
status = decode_chan_attrs(xdr, &res->bc_attrs);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
@@ -5967,7 +5767,6 @@ out_err:
res->sr_status = status;
return status;
out_overflow:
- print_overflow_msg(__func__, xdr);
status = -EIO;
goto out_err;
#else /* CONFIG_NFS_V4_1 */
@@ -5995,7 +5794,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
if (status == -ETOOSMALL) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
pdev->mincount = be32_to_cpup(p);
dprintk("%s: Min count too small. mincnt = %u\n",
__func__, pdev->mincount);
@@ -6005,7 +5804,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
type = be32_to_cpup(p++);
if (type != pdev->layout_type) {
dprintk("%s: layout mismatch req: %u pdev: %u\n",
@@ -6019,19 +5818,19 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
*/
pdev->mincount = be32_to_cpup(p);
if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount)
- goto out_overflow;
+ return -EIO;
/* Parse notification bitmap, verifying that it is zero. */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
if (len) {
uint32_t i;
p = xdr_inline_decode(xdr, 4 * len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->notification = be32_to_cpup(p++);
for (i = 1; i < len; i++) {
@@ -6043,9 +5842,6 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
}
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
@@ -6115,7 +5911,6 @@ out:
res->status = status;
return status;
out_overflow:
- print_overflow_msg(__func__, xdr);
status = -EIO;
goto out;
}
@@ -6131,16 +5926,13 @@ static int decode_layoutreturn(struct xdr_stream *xdr,
return status;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->lrs_present = be32_to_cpup(p);
if (res->lrs_present)
status = decode_layout_stateid(xdr, &res->stateid);
else
nfs4_stateid_copy(&res->stateid, &invalid_stateid);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_layoutcommit(struct xdr_stream *xdr,
@@ -6158,19 +5950,16 @@ static int decode_layoutcommit(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
sizechanged = be32_to_cpup(p);
if (sizechanged) {
/* throw away new size */
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_test_stateid(struct xdr_stream *xdr,
@@ -6186,21 +5975,17 @@ static int decode_test_stateid(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
num_res = be32_to_cpup(p++);
if (num_res != 1)
- goto out;
+ return -EIO;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->status = be32_to_cpup(p++);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
-out:
- return -EIO;
}
static int decode_free_stateid(struct xdr_stream *xdr,
@@ -7570,11 +7355,11 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
uint64_t new_cookie;
__be32 *p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EAGAIN;
if (*p == xdr_zero) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EAGAIN;
if (*p == xdr_zero)
return -EAGAIN;
entry->eof = 1;
@@ -7583,13 +7368,13 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
- goto out_overflow;
+ return -EAGAIN;
p = xdr_decode_hyper(p, &new_cookie);
entry->len = be32_to_cpup(p);
p = xdr_inline_decode(xdr, entry->len);
if (unlikely(!p))
- goto out_overflow;
+ return -EAGAIN;
entry->name = (const char *) p;
/*
@@ -7601,14 +7386,14 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
entry->fattr->valid = 0;
if (decode_attr_bitmap(xdr, bitmap) < 0)
- goto out_overflow;
+ return -EAGAIN;
if (decode_attr_length(xdr, &len, &savep) < 0)
- goto out_overflow;
+ return -EAGAIN;
if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
NULL, entry->label, entry->server) < 0)
- goto out_overflow;
+ return -EAGAIN;
if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
entry->ino = entry->fattr->mounted_on_fileid;
else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
@@ -7622,10 +7407,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
entry->cookie = new_cookie;
return 0;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EAGAIN;
}
/*
@@ -7791,6 +7572,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
PROC42(COPY, enc_copy, dec_copy),
PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel),
PROC(LOOKUPP, enc_lookupp, dec_lookupp),
+ PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror),
};
static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c
index b60d5fbd7727..a90b363500c2 100644
--- a/fs/nfs/nfstrace.c
+++ b/fs/nfs/nfstrace.c
@@ -11,3 +11,4 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_status);
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index bd60f8d1e181..a0d6910aa03a 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -969,6 +969,91 @@ TRACE_EVENT(nfs_commit_done,
)
);
+TRACE_DEFINE_ENUM(NFS_OK);
+TRACE_DEFINE_ENUM(NFSERR_PERM);
+TRACE_DEFINE_ENUM(NFSERR_NOENT);
+TRACE_DEFINE_ENUM(NFSERR_IO);
+TRACE_DEFINE_ENUM(NFSERR_NXIO);
+TRACE_DEFINE_ENUM(NFSERR_ACCES);
+TRACE_DEFINE_ENUM(NFSERR_EXIST);
+TRACE_DEFINE_ENUM(NFSERR_XDEV);
+TRACE_DEFINE_ENUM(NFSERR_NODEV);
+TRACE_DEFINE_ENUM(NFSERR_NOTDIR);
+TRACE_DEFINE_ENUM(NFSERR_ISDIR);
+TRACE_DEFINE_ENUM(NFSERR_INVAL);
+TRACE_DEFINE_ENUM(NFSERR_FBIG);
+TRACE_DEFINE_ENUM(NFSERR_NOSPC);
+TRACE_DEFINE_ENUM(NFSERR_ROFS);
+TRACE_DEFINE_ENUM(NFSERR_MLINK);
+TRACE_DEFINE_ENUM(NFSERR_NAMETOOLONG);
+TRACE_DEFINE_ENUM(NFSERR_NOTEMPTY);
+TRACE_DEFINE_ENUM(NFSERR_DQUOT);
+TRACE_DEFINE_ENUM(NFSERR_STALE);
+TRACE_DEFINE_ENUM(NFSERR_REMOTE);
+TRACE_DEFINE_ENUM(NFSERR_WFLUSH);
+TRACE_DEFINE_ENUM(NFSERR_BADHANDLE);
+TRACE_DEFINE_ENUM(NFSERR_NOT_SYNC);
+TRACE_DEFINE_ENUM(NFSERR_BAD_COOKIE);
+TRACE_DEFINE_ENUM(NFSERR_NOTSUPP);
+TRACE_DEFINE_ENUM(NFSERR_TOOSMALL);
+TRACE_DEFINE_ENUM(NFSERR_SERVERFAULT);
+TRACE_DEFINE_ENUM(NFSERR_BADTYPE);
+TRACE_DEFINE_ENUM(NFSERR_JUKEBOX);
+
+#define nfs_show_status(x) \
+ __print_symbolic(x, \
+ { NFS_OK, "OK" }, \
+ { NFSERR_PERM, "PERM" }, \
+ { NFSERR_NOENT, "NOENT" }, \
+ { NFSERR_IO, "IO" }, \
+ { NFSERR_NXIO, "NXIO" }, \
+ { NFSERR_ACCES, "ACCES" }, \
+ { NFSERR_EXIST, "EXIST" }, \
+ { NFSERR_XDEV, "XDEV" }, \
+ { NFSERR_NODEV, "NODEV" }, \
+ { NFSERR_NOTDIR, "NOTDIR" }, \
+ { NFSERR_ISDIR, "ISDIR" }, \
+ { NFSERR_INVAL, "INVAL" }, \
+ { NFSERR_FBIG, "FBIG" }, \
+ { NFSERR_NOSPC, "NOSPC" }, \
+ { NFSERR_ROFS, "ROFS" }, \
+ { NFSERR_MLINK, "MLINK" }, \
+ { NFSERR_NAMETOOLONG, "NAMETOOLONG" }, \
+ { NFSERR_NOTEMPTY, "NOTEMPTY" }, \
+ { NFSERR_DQUOT, "DQUOT" }, \
+ { NFSERR_STALE, "STALE" }, \
+ { NFSERR_REMOTE, "REMOTE" }, \
+ { NFSERR_WFLUSH, "WFLUSH" }, \
+ { NFSERR_BADHANDLE, "BADHANDLE" }, \
+ { NFSERR_NOT_SYNC, "NOTSYNC" }, \
+ { NFSERR_BAD_COOKIE, "BADCOOKIE" }, \
+ { NFSERR_NOTSUPP, "NOTSUPP" }, \
+ { NFSERR_TOOSMALL, "TOOSMALL" }, \
+ { NFSERR_SERVERFAULT, "REMOTEIO" }, \
+ { NFSERR_BADTYPE, "BADTYPE" }, \
+ { NFSERR_JUKEBOX, "JUKEBOX" })
+
+TRACE_EVENT(nfs_xdr_status,
+ TP_PROTO(
+ int error
+ ),
+
+ TP_ARGS(error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "error=%d (%s)",
+ __entry->error, nfs_show_status(__entry->error)
+ )
+);
+
#endif /* _TRACE_NFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e54d899c1848..e9f39fa5964b 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -350,7 +350,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
/**
* nfs_unlock_request - Unlock request and wake up sleepers.
- * @req:
+ * @req: pointer to request
*/
void nfs_unlock_request(struct nfs_page *req)
{
@@ -368,7 +368,7 @@ void nfs_unlock_request(struct nfs_page *req)
/**
* nfs_unlock_and_release_request - Unlock request and release the nfs_page
- * @req:
+ * @req: pointer to request
*/
void nfs_unlock_and_release_request(struct nfs_page *req)
{
@@ -531,7 +531,6 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
* nfs_pgio_rpcsetup - Set up arguments for a pageio call
* @hdr: The pageio hdr
* @count: Number of bytes to read
- * @offset: Initial offset
* @how: How to commit data (writes only)
* @cinfo: Commit information for the call (writes only)
*/
@@ -634,7 +633,6 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
/**
* nfs_pgio_error - Clean up from a pageio error
- * @desc: IO descriptor
* @hdr: pageio header
*/
static void nfs_pgio_error(struct nfs_pgio_header *hdr)
@@ -768,8 +766,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
pageused = 0;
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &hdr->pages);
+ nfs_list_move_request(req, &hdr->pages);
if (!last_page || last_page != req->wb_page) {
pageused++;
@@ -893,6 +890,7 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
* nfs_can_coalesce_requests - test two requests for compatibility
* @prev: pointer to nfs_page
* @req: pointer to nfs_page
+ * @pgio: pointer to nfs_pagio_descriptor
*
* The nfs_page structures 'prev' and 'req' are compared to ensure that the
* page data area they describe is contiguous, and that their RPC
@@ -961,8 +959,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
}
if (!nfs_can_coalesce_requests(prev, req, desc))
return 0;
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &mirror->pg_list);
+ nfs_list_move_request(req, &mirror->pg_list);
mirror->pg_count += req->wb_bytes;
return 1;
}
@@ -988,6 +985,16 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
}
}
+static void
+nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
+{
+ LIST_HEAD(head);
+
+ nfs_list_move_request(req, &head);
+ desc->pg_completion_ops->error_cleanup(&head, desc->pg_error);
+}
+
/**
* nfs_pageio_add_request - Attempt to coalesce a request into a page list.
* @desc: destination io descriptor
@@ -1025,10 +1032,8 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
nfs_page_group_unlock(req);
desc->pg_moreio = 1;
nfs_pageio_doio(desc);
- if (desc->pg_error < 0)
- return 0;
- if (mirror->pg_recoalesce)
- return 0;
+ if (desc->pg_error < 0 || mirror->pg_recoalesce)
+ goto out_cleanup_subreq;
/* retry add_request for this subreq */
nfs_page_group_lock(req);
continue;
@@ -1061,6 +1066,10 @@ err_ptr:
desc->pg_error = PTR_ERR(subreq);
nfs_page_group_unlock(req);
return 0;
+out_cleanup_subreq:
+ if (req != subreq)
+ nfs_pageio_cleanup_request(desc, subreq);
+ return 0;
}
static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
@@ -1079,7 +1088,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
struct nfs_page *req;
req = list_first_entry(&head, struct nfs_page, wb_list);
- nfs_list_remove_request(req);
if (__nfs_pageio_add_request(desc, req))
continue;
if (desc->pg_error < 0) {
@@ -1120,7 +1128,8 @@ static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc)
for (midx = 0; midx < desc->pg_mirror_count; midx++) {
mirror = &desc->pg_mirrors[midx];
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
+ desc->pg_completion_ops->error_cleanup(&mirror->pg_list,
+ desc->pg_error);
}
}
@@ -1168,11 +1177,14 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (nfs_pgio_has_mirroring(desc))
desc->pg_mirror_idx = midx;
if (!nfs_pageio_add_request_mirror(desc, dupreq))
- goto out_failed;
+ goto out_cleanup_subreq;
}
return 1;
+out_cleanup_subreq:
+ if (req != dupreq)
+ nfs_pageio_cleanup_request(desc, dupreq);
out_failed:
nfs_pageio_error_cleanup(desc);
return 0;
@@ -1194,7 +1206,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
desc->pg_mirror_idx = mirror_idx;
for (;;) {
nfs_pageio_doio(desc);
- if (!mirror->pg_recoalesce)
+ if (desc->pg_error < 0 || !mirror->pg_recoalesce)
break;
if (!nfs_do_recoalesce(desc))
break;
@@ -1222,9 +1234,8 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
while (!list_empty(&hdr->pages)) {
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
- nfs_list_remove_request(req);
if (!nfs_pageio_add_request(desc, req))
- nfs_list_add_request(req, &failed);
+ nfs_list_move_request(req, &failed);
}
nfs_pageio_complete(desc);
if (!list_empty(&failed)) {
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 53726da5c010..7066cd7c7aff 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -758,22 +758,35 @@ static int
pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
struct nfs_server *server,
struct list_head *layout_list)
+ __must_hold(&clp->cl_lock)
+ __must_hold(RCU)
{
struct pnfs_layout_hdr *lo, *next;
struct inode *inode;
list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
- if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags))
+ if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
+ test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) ||
+ !list_empty(&lo->plh_bulk_destroy))
continue;
+ /* If the sb is being destroyed, just bail */
+ if (!nfs_sb_active(server->super))
+ break;
inode = igrab(lo->plh_inode);
- if (inode == NULL)
- continue;
- list_del_init(&lo->plh_layouts);
- if (pnfs_layout_add_bulk_destroy_list(inode, layout_list))
- continue;
- rcu_read_unlock();
- spin_unlock(&clp->cl_lock);
- iput(inode);
+ if (inode != NULL) {
+ list_del_init(&lo->plh_layouts);
+ if (pnfs_layout_add_bulk_destroy_list(inode,
+ layout_list))
+ continue;
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+ iput(inode);
+ } else {
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+ set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
+ }
+ nfs_sb_deactive(server->super);
spin_lock(&clp->cl_lock);
rcu_read_lock();
return -EAGAIN;
@@ -811,7 +824,7 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
/* Free all lsegs that are attached to commit buckets */
nfs_commit_inode(inode, 0);
pnfs_put_layout_hdr(lo);
- iput(inode);
+ nfs_iput_and_deactive(inode);
}
return ret;
}
@@ -1876,7 +1889,7 @@ lookup_again:
atomic_read(&lo->plh_outstanding) != 0) {
spin_unlock(&ino->i_lock);
lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
- atomic_read(&lo->plh_outstanding)));
+ !atomic_read(&lo->plh_outstanding)));
if (IS_ERR(lseg) || !list_empty(&lo->plh_segs))
goto out_put_layout_hdr;
pnfs_put_layout_hdr(lo);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 5e80a07b7bea..c0420b979d88 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -104,6 +104,7 @@ enum {
NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
+ NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */
};
enum layoutdriver_policy_flags {
@@ -349,6 +350,7 @@ void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nf
void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
const struct nfs4_deviceid *);
bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+void nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node);
void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
void nfs4_deviceid_purge_client(const struct nfs_client *);
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 7fb59487ee90..537b80d693f1 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -284,10 +284,22 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
void
+nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node)
+{
+ if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
+ clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ smp_mb__after_atomic();
+ }
+}
+EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_available);
+
+void
nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node)
{
node->timestamp_unavailable = jiffies;
+ smp_mb__before_atomic();
set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ smp_mb__after_atomic();
}
EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable);
@@ -302,6 +314,7 @@ nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node)
if (time_in_range(node->timestamp_unavailable, start, end))
return true;
clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ smp_mb__after_atomic();
}
return false;
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f9f19784db82..1d95a60b2586 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -205,7 +205,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr,
}
static void
-nfs_async_read_error(struct list_head *head)
+nfs_async_read_error(struct list_head *head, int error)
{
struct nfs_page *req;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0570391eaa16..c27ac96a95bd 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1919,7 +1919,7 @@ static int nfs_parse_devname(const char *dev_name,
/* kill possible hostname list: not supported */
comma = strchr(dev_name, ',');
if (comma != NULL && comma < end)
- *comma = 0;
+ len = comma - dev_name;
}
if (len > maxnamlen)
@@ -2041,7 +2041,8 @@ static int nfs23_validate_mount_data(void *options,
memcpy(sap, &data->addr, sizeof(data->addr));
args->nfs_server.addrlen = sizeof(data->addr);
args->nfs_server.port = ntohs(data->addr.sin_port);
- if (!nfs_verify_server_address(sap))
+ if (sap->sa_family != AF_INET ||
+ !nfs_verify_server_address(sap))
goto out_no_address;
if (!(data->flags & NFS_MOUNT_TCP))
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 79b97b3c4427..52d533967485 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -39,6 +39,7 @@ nfs_free_unlinkdata(struct nfs_unlinkdata *data)
/**
* nfs_async_unlink_done - Sillydelete post-processing
* @task: rpc_task of the sillydelete
+ * @calldata: pointer to nfs_unlinkdata
*
* Do the directory attribute update.
*/
@@ -54,7 +55,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
/**
* nfs_async_unlink_release - Release the sillydelete data.
- * @task: rpc_task of the sillydelete
+ * @calldata: struct nfs_unlinkdata to release
*
* We need to call nfs_put_unlinkdata as a 'tk_release' task since the
* rpc_task would be freed too.
@@ -159,8 +160,8 @@ static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nf
/**
* nfs_async_unlink - asynchronous unlinking of a file
- * @dir: parent directory of dentry
- * @dentry: dentry to unlink
+ * @dentry: parent directory of dentry
+ * @name: name of dentry to unlink
*/
static int
nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
@@ -324,6 +325,7 @@ static const struct rpc_call_ops nfs_rename_ops = {
* @new_dir: target directory for the rename
* @old_dentry: original dentry to be renamed
* @new_dentry: dentry to which the old_dentry should be renamed
+ * @complete: Function to run on successful completion
*
* It's expected that valid references to the dentries and inodes are held
*/
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d09c9f878141..f3ebabaa291d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -26,6 +26,7 @@
#include <linux/iversion.h>
#include <linux/uaccess.h>
+#include <linux/sched/mm.h>
#include "delegation.h"
#include "internal.h"
@@ -712,11 +713,13 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
struct nfs_pageio_descriptor pgio;
- struct nfs_io_completion *ioc = nfs_io_completion_alloc(GFP_NOFS);
+ struct nfs_io_completion *ioc;
+ unsigned int pflags = memalloc_nofs_save();
int err;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
+ ioc = nfs_io_completion_alloc(GFP_NOFS);
if (ioc)
nfs_io_completion_init(ioc, nfs_io_completion_commit, inode);
@@ -727,6 +730,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
nfs_pageio_complete(&pgio);
nfs_io_completion_put(ioc);
+ memalloc_nofs_restore(pflags);
+
if (err < 0)
goto out_err;
err = pgio.pg_error;
@@ -865,7 +870,6 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
/**
* nfs_request_add_commit_list - add request to a commit list
* @req: pointer to a struct nfs_page
- * @dst: commit list head
* @cinfo: holds list lock and accounting info
*
* This sets the PG_CLEAN bit, updates the cinfo count of
@@ -1412,20 +1416,27 @@ static void nfs_redirty_request(struct nfs_page *req)
nfs_release_request(req);
}
-static void nfs_async_write_error(struct list_head *head)
+static void nfs_async_write_error(struct list_head *head, int error)
{
struct nfs_page *req;
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
+ if (nfs_error_is_fatal(error)) {
+ nfs_context_set_write_error(req->wb_context, error);
+ if (nfs_error_is_fatal_on_server(error)) {
+ nfs_write_error_remove_page(req);
+ continue;
+ }
+ }
nfs_redirty_request(req);
}
}
static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
{
- nfs_async_write_error(&hdr->pages);
+ nfs_async_write_error(&hdr->pages, 0);
filemap_fdatawrite_range(hdr->inode->i_mapping, hdr->args.offset,
hdr->args.offset + hdr->args.count - 1);
}
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9eb8086ea841..9bc32af4e2da 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -442,7 +442,9 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
struct nfsd3_readdirargs *argp = rqstp->rq_argp;
struct nfsd3_readdirres *resp = rqstp->rq_resp;
__be32 nfserr;
- int count;
+ int count = 0;
+ struct page **p;
+ caddr_t page_addr = NULL;
dprintk("nfsd: READDIR(3) %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
@@ -462,9 +464,31 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t*) &argp->cookie,
&resp->common, nfs3svc_encode_entry);
memcpy(resp->verf, argp->verf, 8);
- resp->count = resp->buffer - argp->buffer;
- if (resp->offset)
- xdr_encode_hyper(resp->offset, argp->cookie);
+ count = 0;
+ for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
+ page_addr = page_address(*p);
+
+ if (((caddr_t)resp->buffer >= page_addr) &&
+ ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
+ count += (caddr_t)resp->buffer - page_addr;
+ break;
+ }
+ count += PAGE_SIZE;
+ }
+ resp->count = count >> 2;
+ if (resp->offset) {
+ loff_t offset = argp->cookie;
+
+ if (unlikely(resp->offset1)) {
+ /* we ended up with offset on a page boundary */
+ *resp->offset = htonl(offset >> 32);
+ *resp->offset1 = htonl(offset & 0xffffffff);
+ resp->offset1 = NULL;
+ } else {
+ xdr_encode_hyper(resp->offset, offset);
+ }
+ resp->offset = NULL;
+ }
RETURN_STATUS(nfserr);
}
@@ -533,6 +557,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
} else {
xdr_encode_hyper(resp->offset, offset);
}
+ resp->offset = NULL;
}
RETURN_STATUS(nfserr);
@@ -576,7 +601,7 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp)
resp->f_wtmax = max_blocksize;
resp->f_wtpref = max_blocksize;
resp->f_wtmult = PAGE_SIZE;
- resp->f_dtpref = PAGE_SIZE;
+ resp->f_dtpref = max_blocksize;
resp->f_maxfilesize = ~(u32) 0;
resp->f_properties = NFS3_FSF_DEFAULT;
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 9b973f4f7d01..8d789124ed3c 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -573,6 +573,9 @@ int
nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_readdirargs *args = rqstp->rq_argp;
+ int len;
+ u32 max_blocksize = svc_max_payload(rqstp);
+
p = decode_fh(p, &args->fh);
if (!p)
return 0;
@@ -580,8 +583,14 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
args->verf = p; p += 2;
args->dircount = ~0;
args->count = ntohl(*p++);
- args->count = min_t(u32, args->count, PAGE_SIZE);
- args->buffer = page_address(*(rqstp->rq_next_page++));
+ len = args->count = min_t(u32, args->count, max_blocksize);
+
+ while (len > 0) {
+ struct page *p = *(rqstp->rq_next_page++);
+ if (!args->buffer)
+ args->buffer = page_address(p);
+ len -= PAGE_SIZE;
+ }
return xdr_argsize_check(rqstp, p);
}
@@ -921,6 +930,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
} else {
xdr_encode_hyper(cd->offset, offset64);
}
+ cd->offset = NULL;
}
/*
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c74e4538d0eb..7caa3801ce72 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -60,16 +60,6 @@ struct nfs4_cb_compound_hdr {
int status;
};
-/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("NFS: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
static __be32 *xdr_encode_empty_array(__be32 *p)
{
*p++ = xdr_zero;
@@ -240,7 +230,6 @@ static int decode_cb_op_status(struct xdr_stream *xdr,
*status = nfs_cb_stat_to_errno(be32_to_cpup(p));
return 0;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
out_unexpected:
dprintk("NFSD: Callback server returned operation %d but "
@@ -309,7 +298,6 @@ static int decode_cb_compound4res(struct xdr_stream *xdr,
hdr->nops = be32_to_cpup(p);
return 0;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -437,7 +425,6 @@ out:
cb->cb_seq_status = status;
return status;
out_overflow:
- print_overflow_msg(__func__, xdr);
status = -EIO;
goto out;
}
@@ -913,9 +900,9 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
return PTR_ERR(client);
}
cred = get_backchannel_cred(clp, client, ses);
- if (IS_ERR(cred)) {
+ if (!cred) {
rpc_shutdown_client(client);
- return PTR_ERR(cred);
+ return -ENOMEM;
}
clp->cl_cb_client = client;
clp->cl_cb_cred = cred;
@@ -1023,8 +1010,9 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
cb->cb_seq_status = 1;
cb->cb_status = 0;
if (minorversion) {
- if (!nfsd41_cb_get_slot(clp, task))
+ if (!cb->cb_holds_slot && !nfsd41_cb_get_slot(clp, task))
return;
+ cb->cb_holds_slot = true;
}
rpc_call_start(task);
}
@@ -1051,6 +1039,9 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
return true;
}
+ if (!cb->cb_holds_slot)
+ goto need_restart;
+
switch (cb->cb_seq_status) {
case 0:
/*
@@ -1089,6 +1080,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
cb->cb_seq_status);
}
+ cb->cb_holds_slot = false;
clear_bit(0, &clp->cl_cb_slot_busy);
rpc_wake_up_next(&clp->cl_cb_waitq);
dprintk("%s: freed slot, new seqid=%d\n", __func__,
@@ -1296,6 +1288,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
cb->cb_seq_status = 1;
cb->cb_status = 0;
cb->cb_need_restart = false;
+ cb->cb_holds_slot = false;
}
void nfsd4_run_cb(struct nfsd4_callback *cb)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 5188f9f70c78..8c8563441208 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -126,7 +126,6 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
SHASH_DESC_ON_STACK(desc, tfm);
desc->tfm = tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
status = crypto_shash_digest(desc, clname->data, clname->len,
cksum.data);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fb3c9844c82a..f056b1d3fecd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -265,6 +265,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
static void
free_blocked_lock(struct nfsd4_blocked_lock *nbl)
{
+ locks_delete_block(&nbl->nbl_lock);
locks_release_private(&nbl->nbl_lock);
kfree(nbl);
}
@@ -293,11 +294,18 @@ remove_blocked_locks(struct nfs4_lockowner *lo)
nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock,
nbl_lru);
list_del_init(&nbl->nbl_lru);
- locks_delete_block(&nbl->nbl_lock);
free_blocked_lock(nbl);
}
}
+static void
+nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb)
+{
+ struct nfsd4_blocked_lock *nbl = container_of(cb,
+ struct nfsd4_blocked_lock, nbl_cb);
+ locks_delete_block(&nbl->nbl_lock);
+}
+
static int
nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
@@ -325,6 +333,7 @@ nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb)
}
static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = {
+ .prepare = nfsd4_cb_notify_lock_prepare,
.done = nfsd4_cb_notify_lock_done,
.release = nfsd4_cb_notify_lock_release,
};
@@ -1544,16 +1553,16 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca)
{
u32 slotsize = slot_bytes(ca);
u32 num = ca->maxreqs;
- int avail;
+ unsigned long avail, total_avail;
spin_lock(&nfsd_drc_lock);
- avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
- nfsd_drc_max_mem - nfsd_drc_mem_used);
+ total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used;
+ avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail);
/*
* Never use more than a third of the remaining memory,
* unless it's the only way to give this client a slot:
*/
- avail = clamp_t(int, avail, slotsize, avail/3);
+ avail = clamp_t(int, avail, slotsize, total_avail/3);
num = min_t(int, num, avail / slotsize);
nfsd_drc_mem_used += num * slotsize;
spin_unlock(&nfsd_drc_lock);
@@ -4863,7 +4872,6 @@ nfs4_laundromat(struct nfsd_net *nn)
nbl = list_first_entry(&reaplist,
struct nfsd4_blocked_lock, nbl_lru);
list_del_init(&nbl->nbl_lru);
- locks_delete_block(&nbl->nbl_lock);
free_blocked_lock(nbl);
}
out:
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 72a7681f4046..f2feb2d11bae 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1126,7 +1126,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
case 'Y':
case 'y':
case '1':
- if (nn->nfsd_serv)
+ if (!nn->nfsd_serv)
return -EBUSY;
nfsd4_end_grace(nn);
break;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 396c76755b03..9d6cb246c6c5 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -70,6 +70,7 @@ struct nfsd4_callback {
int cb_seq_status;
int cb_status;
bool cb_need_restart;
+ bool cb_holds_slot;
};
struct nfsd4_callback_ops {
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index f2129a5d9f23..4391fd3abd8f 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -189,7 +189,7 @@ retry:
*/
if (!err)
return 0;
- else if (err != -EEXIST)
+ else if (err != -EBUSY)
goto failed_unlock;
err = invalidate_inode_pages2_range(btnc, newkey, newkey);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 6b9c27548997..63c6bb1f8c4d 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -346,10 +346,16 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
__kernel_fsid_t fsid = {};
fsnotify_foreach_obj_type(type) {
+ struct fsnotify_mark_connector *conn;
+
if (!fsnotify_iter_should_report_type(iter_info, type))
continue;
- fsid = iter_info->marks[type]->connector->fsid;
+ conn = READ_ONCE(iter_info->marks[type]->connector);
+ /* Mark is just getting destroyed or created? */
+ if (!conn)
+ continue;
+ fsid = conn->fsid;
if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
continue;
return fsid;
@@ -408,8 +414,12 @@ static int fanotify_handle_event(struct fsnotify_group *group,
return 0;
}
- if (FAN_GROUP_FLAG(group, FAN_REPORT_FID))
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
fsid = fanotify_get_fsid(iter_info);
+ /* Racing with mark destruction or creation? */
+ if (!fsid.val[0] && !fsid.val[1])
+ return 0;
+ }
event = fanotify_alloc_event(group, inode, mask, data, data_type,
&fsid);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 56992b32c6bb..a90bb19dcfa2 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -208,6 +208,7 @@ static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
{
struct fanotify_event_info_fid info = { };
struct file_handle handle = { };
+ unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh;
size_t fh_len = event->fh_len;
size_t len = fanotify_event_info_len(event);
@@ -233,7 +234,16 @@ static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
buf += sizeof(handle);
len -= sizeof(handle);
- if (copy_to_user(buf, fanotify_event_fh(event), fh_len))
+ /*
+ * For an inline fh, copy through stack to exclude the copy from
+ * usercopy hardening protections.
+ */
+ fh = fanotify_event_fh(event);
+ if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
+ memcpy(bounce, fh, fh_len);
+ fh = bounce;
+ }
+ if (copy_to_user(buf, fh, fh_len))
return -EFAULT;
/* Pad with 0's */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e2901fbb9f76..7b53598c8804 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -519,8 +519,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
if (!fsn_mark)
return -ENOENT;
- else if (create)
- return -EEXIST;
+ else if (create) {
+ ret = -EEXIST;
+ goto out;
+ }
i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
@@ -548,6 +550,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
/* return the wd */
ret = i_mark->wd;
+out:
/* match the get from fsnotify_find_mark() */
fsnotify_put_mark(fsn_mark);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d593d4269561..22acb0a79b53 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -239,13 +239,13 @@ static void fsnotify_drop_object(unsigned int type, void *objp)
void fsnotify_put_mark(struct fsnotify_mark *mark)
{
- struct fsnotify_mark_connector *conn;
+ struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector);
void *objp = NULL;
unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED;
bool free_conn = false;
/* Catch marks that were actually never attached to object */
- if (!mark->connector) {
+ if (!conn) {
if (refcount_dec_and_test(&mark->refcnt))
fsnotify_final_mark_destroy(mark);
return;
@@ -255,10 +255,9 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
* We have to be careful so that traversals of obj_list under lock can
* safely grab mark reference.
*/
- if (!refcount_dec_and_lock(&mark->refcnt, &mark->connector->lock))
+ if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock))
return;
- conn = mark->connector;
hlist_del_init_rcu(&mark->obj_list);
if (hlist_empty(&conn->list)) {
objp = fsnotify_detach_connector_from_object(conn, &type);
@@ -266,7 +265,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
} else {
__fsnotify_recalc_mask(conn);
}
- mark->connector = NULL;
+ WRITE_ONCE(mark->connector, NULL);
spin_unlock(&conn->lock);
fsnotify_drop_object(type, objp);
@@ -620,7 +619,7 @@ restart:
/* mark should be the last entry. last is the current last entry */
hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
added:
- mark->connector = conn;
+ WRITE_ONCE(mark->connector, conn);
out_err:
spin_unlock(&conn->lock);
spin_unlock(&mark->lock);
@@ -808,6 +807,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
refcount_set(&mark->refcnt, 1);
fsnotify_get_group(group);
mark->group = group;
+ WRITE_ONCE(mark->connector, NULL);
}
/*
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index a35259eebc56..1dc9a08e8bdc 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4719,22 +4719,23 @@ out:
/* Lock an inode and grab a bh pointing to the inode. */
int ocfs2_reflink_inodes_lock(struct inode *s_inode,
- struct buffer_head **bh1,
+ struct buffer_head **bh_s,
struct inode *t_inode,
- struct buffer_head **bh2)
+ struct buffer_head **bh_t)
{
- struct inode *inode1;
- struct inode *inode2;
+ struct inode *inode1 = s_inode;
+ struct inode *inode2 = t_inode;
struct ocfs2_inode_info *oi1;
struct ocfs2_inode_info *oi2;
+ struct buffer_head *bh1 = NULL;
+ struct buffer_head *bh2 = NULL;
bool same_inode = (s_inode == t_inode);
+ bool need_swap = (inode1->i_ino > inode2->i_ino);
int status;
/* First grab the VFS and rw locks. */
lock_two_nondirectories(s_inode, t_inode);
- inode1 = s_inode;
- inode2 = t_inode;
- if (inode1->i_ino > inode2->i_ino)
+ if (need_swap)
swap(inode1, inode2);
status = ocfs2_rw_lock(inode1, 1);
@@ -4757,17 +4758,13 @@ int ocfs2_reflink_inodes_lock(struct inode *s_inode,
trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
(unsigned long long)oi2->ip_blkno);
- if (*bh1)
- *bh1 = NULL;
- if (*bh2)
- *bh2 = NULL;
-
/* We always want to lock the one with the lower lockid first. */
if (oi1->ip_blkno > oi2->ip_blkno)
mlog_errno(-ENOLCK);
/* lock id1 */
- status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
+ status = ocfs2_inode_lock_nested(inode1, &bh1, 1,
+ OI_LS_REFLINK_TARGET);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -4776,15 +4773,25 @@ int ocfs2_reflink_inodes_lock(struct inode *s_inode,
/* lock id2 */
if (!same_inode) {
- status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+ status = ocfs2_inode_lock_nested(inode2, &bh2, 1,
OI_LS_REFLINK_TARGET);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
goto out_cl1;
}
- } else
- *bh2 = *bh1;
+ } else {
+ bh2 = bh1;
+ }
+
+ /*
+ * If we swapped inode order above, we have to swap the buffer heads
+ * before passing them back to the caller.
+ */
+ if (need_swap)
+ swap(bh1, bh2);
+ *bh_s = bh1;
+ *bh_t = bh2;
trace_ocfs2_double_lock_end(
(unsigned long long)oi1->ip_blkno,
@@ -4794,8 +4801,7 @@ int ocfs2_reflink_inodes_lock(struct inode *s_inode,
out_cl1:
ocfs2_inode_unlock(inode1, 1);
- brelse(*bh1);
- *bh1 = NULL;
+ brelse(bh1);
out_rw2:
ocfs2_rw_unlock(inode2, 1);
out_i2:
diff --git a/fs/open.c b/fs/open.c
index 0285ce7dbd51..a00350018a47 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -733,6 +733,12 @@ static int do_dentry_open(struct file *f,
return 0;
}
+ /* Any file opened for execve()/uselib() has to be a regular file. */
+ if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) {
+ error = -EACCES;
+ goto cleanup_file;
+ }
+
if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
error = get_write_access(inode);
if (unlikely(error))
@@ -1209,3 +1215,21 @@ int nonseekable_open(struct inode *inode, struct file *filp)
}
EXPORT_SYMBOL(nonseekable_open);
+
+/*
+ * stream_open is used by subsystems that want stream-like file descriptors.
+ * Such file descriptors are not seekable and don't have notion of position
+ * (file.f_pos is always 0). Contrary to file descriptors of other regular
+ * files, .read() and .write() can run simultaneously.
+ *
+ * stream_open never fails and is marked to return int so that it could be
+ * directly used as file_operations.open .
+ */
+int stream_open(struct inode *inode, struct file *filp)
+{
+ filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS);
+ filp->f_mode |= FMODE_STREAM;
+ return 0;
+}
+
+EXPORT_SYMBOL(stream_open);
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index f038235c64bd..c3334eca18c7 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -261,11 +261,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
generic_fillattr(inode, stat);
/* override block size reported to stat */
- if (request_mask & STATX_SIZE)
- stat->result_mask = STATX_BASIC_STATS;
- else
- stat->result_mask = STATX_BASIC_STATS &
- ~STATX_SIZE;
+ if (!(request_mask & STATX_SIZE))
+ stat->result_mask &= ~STATX_SIZE;
stat->attributes_mask = STATX_ATTR_IMMUTABLE |
STATX_ATTR_APPEND;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 9e62dcf06fc4..68b3303e4b46 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -443,6 +443,24 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
{
int err;
+ /*
+ * Copy up data first and then xattrs. Writing data after
+ * xattrs will remove security.capability xattr automatically.
+ */
+ if (S_ISREG(c->stat.mode) && !c->metacopy) {
+ struct path upperpath, datapath;
+
+ ovl_path_upper(c->dentry, &upperpath);
+ if (WARN_ON(upperpath.dentry != NULL))
+ return -EIO;
+ upperpath.dentry = temp;
+
+ ovl_path_lowerdata(c->dentry, &datapath);
+ err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
+ if (err)
+ return err;
+ }
+
err = ovl_copy_xattr(c->lowerpath.dentry, temp);
if (err)
return err;
@@ -460,19 +478,6 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
return err;
}
- if (S_ISREG(c->stat.mode) && !c->metacopy) {
- struct path upperpath, datapath;
-
- ovl_path_upper(c->dentry, &upperpath);
- BUG_ON(upperpath.dentry != NULL);
- upperpath.dentry = temp;
-
- ovl_path_lowerdata(c->dentry, &datapath);
- err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
- if (err)
- return err;
- }
-
if (c->metacopy) {
err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY,
NULL, 0, -EOPNOTSUPP);
@@ -737,6 +742,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
{
struct path upperpath, datapath;
int err;
+ char *capability = NULL;
+ ssize_t uninitialized_var(cap_size);
ovl_path_upper(c->dentry, &upperpath);
if (WARN_ON(upperpath.dentry == NULL))
@@ -746,15 +753,37 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
if (WARN_ON(datapath.dentry == NULL))
return -EIO;
+ if (c->stat.size) {
+ err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS,
+ &capability, 0);
+ if (err < 0 && err != -ENODATA)
+ goto out;
+ }
+
err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
if (err)
- return err;
+ goto out_free;
+
+ /*
+ * Writing to upper file will clear security.capability xattr. We
+ * don't want that to happen for normal copy-up operation.
+ */
+ if (capability) {
+ err = ovl_do_setxattr(upperpath.dentry, XATTR_NAME_CAPS,
+ capability, cap_size, 0);
+ if (err)
+ goto out_free;
+ }
+
err = vfs_removexattr(upperpath.dentry, OVL_XATTR_METACOPY);
if (err)
- return err;
+ goto out_free;
ovl_set_upperdata(d_inode(c->dentry));
+out_free:
+ kfree(capability);
+out:
return err;
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 5e45cb3630a0..9c6018287d57 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -277,6 +277,8 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
int ovl_check_metacopy_xattr(struct dentry *dentry);
bool ovl_is_metacopy_dentry(struct dentry *dentry);
char *ovl_get_redirect_xattr(struct dentry *dentry, int padding);
+ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value,
+ size_t padding);
static inline bool ovl_is_impuredir(struct dentry *dentry)
{
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 7c01327b1852..4035e640f402 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -863,28 +863,49 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry)
return (oe->numlower > 1);
}
-char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
+ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value,
+ size_t padding)
{
- int res;
- char *s, *next, *buf = NULL;
+ ssize_t res;
+ char *buf = NULL;
- res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0);
+ res = vfs_getxattr(dentry, name, NULL, 0);
if (res < 0) {
if (res == -ENODATA || res == -EOPNOTSUPP)
- return NULL;
+ return -ENODATA;
goto fail;
}
- buf = kzalloc(res + padding + 1, GFP_KERNEL);
- if (!buf)
- return ERR_PTR(-ENOMEM);
+ if (res != 0) {
+ buf = kzalloc(res + padding, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
- if (res == 0)
- goto invalid;
+ res = vfs_getxattr(dentry, name, buf, res);
+ if (res < 0)
+ goto fail;
+ }
+ *value = buf;
+
+ return res;
+
+fail:
+ pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n",
+ name, res);
+ kfree(buf);
+ return res;
+}
- res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res);
+char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
+{
+ int res;
+ char *s, *next, *buf = NULL;
+
+ res = ovl_getxattr(dentry, OVL_XATTR_REDIRECT, &buf, padding + 1);
+ if (res == -ENODATA)
+ return NULL;
if (res < 0)
- goto fail;
+ return ERR_PTR(res);
if (res == 0)
goto invalid;
@@ -900,15 +921,9 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
}
return buf;
-
-err_free:
- kfree(buf);
- return ERR_PTR(res);
-fail:
- pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res);
- goto err_free;
invalid:
pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf);
res = -EINVAL;
- goto err_free;
+ kfree(buf);
+ return ERR_PTR(res);
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 51d5fd8840ab..41065901106b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal);
* in the tee() system call, when we duplicate the buffers in one
* pipe into another.
*/
-void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
+bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
- get_page(buf->page);
+ return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);
@@ -225,8 +225,15 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
}
EXPORT_SYMBOL(generic_pipe_buf_release);
+/* New data written to a pipe may be appended to a buffer with this type. */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
- .can_merge = 1,
+ .confirm = generic_pipe_buf_confirm,
+ .release = anon_pipe_buf_release,
+ .steal = anon_pipe_buf_steal,
+ .get = generic_pipe_buf_get,
+};
+
+static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
.confirm = generic_pipe_buf_confirm,
.release = anon_pipe_buf_release,
.steal = anon_pipe_buf_steal,
@@ -234,13 +241,32 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
};
static const struct pipe_buf_operations packet_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = anon_pipe_buf_release,
.steal = anon_pipe_buf_steal,
.get = generic_pipe_buf_get,
};
+/**
+ * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable
+ * @buf: the buffer to mark
+ *
+ * Description:
+ * This function ensures that no future writes will be merged into the
+ * given &struct pipe_buffer. This is necessary when multiple pipe buffers
+ * share the same backing page.
+ */
+void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
+{
+ if (buf->ops == &anon_pipe_buf_ops)
+ buf->ops = &anon_pipe_buf_nomerge_ops;
+}
+
+static bool pipe_buf_can_merge(struct pipe_buffer *buf)
+{
+ return buf->ops == &anon_pipe_buf_ops;
+}
+
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
@@ -378,7 +404,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
struct pipe_buffer *buf = pipe->bufs + lastbuf;
int offset = buf->offset + buf->len;
- if (buf->ops->can_merge && offset + chars <= PAGE_SIZE) {
+ if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
ret = pipe_buf_confirm(pipe, buf);
if (ret)
goto out;
diff --git a/fs/pnode.c b/fs/pnode.c
index 1100e810d855..7ea6cfb65077 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -214,7 +214,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
}
/* all accesses are serialized by namespace_sem */
-static struct user_namespace *user_ns;
static struct mount *last_dest, *first_source, *last_source, *dest_master;
static struct mountpoint *mp;
static struct hlist_head *list;
@@ -260,9 +259,6 @@ static int propagate_one(struct mount *m)
type |= CL_MAKE_SHARED;
}
- /* Notice when we are propagating across user namespaces */
- if (m->mnt_ns->user_ns != user_ns)
- type |= CL_UNPRIVILEGED;
child = copy_tree(last_source, last_source->mnt.mnt_root, type);
if (IS_ERR(child))
return PTR_ERR(child);
@@ -303,7 +299,6 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
* propagate_one(); everything is serialized by namespace_sem,
* so globals will do just fine.
*/
- user_ns = current->nsproxy->mnt_ns->user_ns;
last_dest = dest_mnt;
first_source = source_mnt;
last_source = source_mnt;
diff --git a/fs/pnode.h b/fs/pnode.h
index dc87e65becd2..3960a83666cf 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -27,8 +27,7 @@
#define CL_MAKE_SHARED 0x08
#define CL_PRIVATE 0x10
#define CL_SHARED_TO_SLAVE 0x20
-#define CL_UNPRIVILEGED 0x40
-#define CL_COPY_MNT_NS_FILE 0x80
+#define CL_COPY_MNT_NS_FILE 0x40
#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5ab1849971b4..f179568b4c76 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -59,6 +59,7 @@
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/generic-radix-tree.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/namei.h>
@@ -92,7 +93,6 @@
#include <linux/sched/coredump.h>
#include <linux/sched/debug.h>
#include <linux/sched/stat.h>
-#include <linux/flex_array.h>
#include <linux/posix-timers.h>
#include <trace/events/oom.h>
#include "internal.h"
@@ -407,7 +407,6 @@ static void unlock_trace(struct task_struct *task)
static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- struct stack_trace trace;
unsigned long *entries;
int err;
@@ -430,20 +429,17 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
if (!entries)
return -ENOMEM;
- trace.nr_entries = 0;
- trace.max_entries = MAX_STACK_TRACE_DEPTH;
- trace.entries = entries;
- trace.skip = 0;
-
err = lock_trace(task);
if (!err) {
- unsigned int i;
+ unsigned int i, nr_entries;
- save_stack_trace_tsk(task, &trace);
+ nr_entries = stack_trace_save_tsk(task, entries,
+ MAX_STACK_TRACE_DEPTH, 0);
- for (i = 0; i < trace.nr_entries; i++) {
+ for (i = 0; i < nr_entries; i++) {
seq_printf(m, "[<0>] %pB\n", (void *)entries[i]);
}
+
unlock_trace(task);
}
kfree(entries);
@@ -489,10 +485,9 @@ static int lstats_show_proc(struct seq_file *m, void *v)
lr->count, lr->time, lr->max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
unsigned long bt = lr->backtrace[q];
+
if (!bt)
break;
- if (bt == ULONG_MAX)
- break;
seq_printf(m, " %ps", (void *)bt);
}
seq_putc(m, '\n');
@@ -616,24 +611,25 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- long nr;
- unsigned long args[6], sp, pc;
+ struct syscall_info info;
+ u64 *args = &info.data.args[0];
int res;
res = lock_trace(task);
if (res)
return res;
- if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
+ if (task_current_syscall(task, &info))
seq_puts(m, "running\n");
- else if (nr < 0)
- seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
+ else if (info.data.nr < 0)
+ seq_printf(m, "%d 0x%llx 0x%llx\n",
+ info.data.nr, info.sp, info.data.instruction_pointer);
else
seq_printf(m,
- "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
- nr,
+ "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
+ info.data.nr,
args[0], args[1], args[2], args[3], args[4], args[5],
- sp, pc);
+ info.sp, info.data.instruction_pointer);
unlock_trace(task);
return 0;
@@ -2142,11 +2138,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct task_struct *task;
struct mm_struct *mm;
unsigned long nr_files, pos, i;
- struct flex_array *fa = NULL;
- struct map_files_info info;
+ GENRADIX(struct map_files_info) fa;
struct map_files_info *p;
int ret;
+ genradix_init(&fa);
+
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
@@ -2178,35 +2175,22 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
*/
for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
- if (vma->vm_file && ++pos > ctx->pos)
- nr_files++;
- }
+ if (!vma->vm_file)
+ continue;
+ if (++pos <= ctx->pos)
+ continue;
- if (nr_files) {
- fa = flex_array_alloc(sizeof(info), nr_files,
- GFP_KERNEL);
- if (!fa || flex_array_prealloc(fa, 0, nr_files,
- GFP_KERNEL)) {
+ p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
+ if (!p) {
ret = -ENOMEM;
- if (fa)
- flex_array_free(fa);
up_read(&mm->mmap_sem);
mmput(mm);
goto out_put_task;
}
- for (i = 0, vma = mm->mmap, pos = 2; vma;
- vma = vma->vm_next) {
- if (!vma->vm_file)
- continue;
- if (++pos <= ctx->pos)
- continue;
- info.start = vma->vm_start;
- info.end = vma->vm_end;
- info.mode = vma->vm_file->f_mode;
- if (flex_array_put(fa, i++, &info, GFP_KERNEL))
- BUG();
- }
+ p->start = vma->vm_start;
+ p->end = vma->vm_end;
+ p->mode = vma->vm_file->f_mode;
}
up_read(&mm->mmap_sem);
mmput(mm);
@@ -2215,7 +2199,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
unsigned int len;
- p = flex_array_get(fa, i);
+ p = genradix_ptr(&fa, i);
len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
if (!proc_fill_cache(file, ctx,
buf, len,
@@ -2225,12 +2209,11 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
break;
ctx->pos++;
}
- if (fa)
- flex_array_free(fa);
out_put_task:
put_task_struct(task);
out:
+ genradix_free(&fa);
return ret;
}
@@ -2459,11 +2442,10 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry,
static struct dentry *proc_pident_lookup(struct inode *dir,
struct dentry *dentry,
- const struct pid_entry *ents,
- unsigned int nents)
+ const struct pid_entry *p,
+ const struct pid_entry *end)
{
struct task_struct *task = get_proc_task(dir);
- const struct pid_entry *p, *last;
struct dentry *res = ERR_PTR(-ENOENT);
if (!task)
@@ -2473,8 +2455,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
* Yes, it does not scale. And it should not. Don't add
* new entries into /proc/<tgid>/ without very good reasons.
*/
- last = &ents[nents];
- for (p = ents; p < last; p++) {
+ for (; p < end; p++) {
if (p->len != dentry->d_name.len)
continue;
if (!memcmp(dentry->d_name.name, p->name, p->len)) {
@@ -2610,7 +2591,7 @@ static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
{ \
return proc_pident_lookup(dir, dentry, \
LSM##_attr_dir_stuff, \
- ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+ LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
@@ -2655,7 +2636,8 @@ static struct dentry *proc_attr_dir_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
- attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
+ attr_dir_stuff,
+ attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff));
}
static const struct inode_operations proc_attr_dir_inode_operations = {
@@ -3088,10 +3070,20 @@ static const struct file_operations proc_tgid_base_operations = {
.llseek = generic_file_llseek,
};
+struct pid *tgid_pidfd_to_pid(const struct file *file)
+{
+ if (!d_is_dir(file->f_path.dentry) ||
+ (file->f_op != &proc_tgid_base_operations))
+ return ERR_PTR(-EBADF);
+
+ return proc_pid(file_inode(file));
+}
+
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
- tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
+ tgid_base_stuff,
+ tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff));
}
static const struct inode_operations proc_tgid_base_inode_operations = {
@@ -3463,7 +3455,8 @@ static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
- tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+ tid_base_stuff,
+ tid_base_stuff + ARRAY_SIZE(tid_base_stuff));
}
static const struct file_operations proc_tid_base_operations = {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index da649ccd6804..fc7e38def174 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -24,7 +24,6 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/mount.h>
-#include <linux/magic.h>
#include <linux/uaccess.h>
@@ -122,13 +121,12 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
-static const struct super_operations proc_sops = {
+const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
.destroy_inode = proc_destroy_inode,
.drop_inode = generic_delete_inode,
.evict_inode = proc_evict_inode,
.statfs = simple_statfs,
- .remount_fs = proc_remount,
.show_options = proc_show_options,
};
@@ -488,51 +486,3 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
pde_put(de);
return inode;
}
-
-int proc_fill_super(struct super_block *s, void *data, int silent)
-{
- struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
- struct inode *root_inode;
- int ret;
-
- if (!proc_parse_options(data, ns))
- return -EINVAL;
-
- /* User space would break if executables or devices appear on proc */
- s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
- s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
- s->s_blocksize = 1024;
- s->s_blocksize_bits = 10;
- s->s_magic = PROC_SUPER_MAGIC;
- s->s_op = &proc_sops;
- s->s_time_gran = 1;
-
- /*
- * procfs isn't actually a stacking filesystem; however, there is
- * too much magic going on inside it to permit stacking things on
- * top of it
- */
- s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
-
- /* procfs dentries and inodes don't require IO to create */
- s->s_shrink.seeks = 0;
-
- pde_get(&proc_root);
- root_inode = proc_get_inode(s, &proc_root);
- if (!root_inode) {
- pr_err("proc_fill_super: get root inode failed\n");
- return -ENOMEM;
- }
-
- s->s_root = d_make_root(root_inode);
- if (!s->s_root) {
- pr_err("proc_fill_super: allocate dentry failed\n");
- return -ENOMEM;
- }
-
- ret = proc_setup_self(s);
- if (ret) {
- return ret;
- }
- return proc_setup_thread_self(s);
-}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index ea575375f210..d1671e97f7fe 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -207,13 +207,12 @@ struct pde_opener {
struct completion *c;
} __randomize_layout;
extern const struct inode_operations proc_link_inode_operations;
-
extern const struct inode_operations proc_pid_link_inode_operations;
+extern const struct super_operations proc_sops;
void proc_init_kmemcache(void);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
-extern int proc_fill_super(struct super_block *, void *data, int flags);
extern void proc_entry_rundown(struct proc_dir_entry *);
/*
@@ -271,10 +270,8 @@ static inline void proc_tty_init(void) {}
* root.c
*/
extern struct proc_dir_entry proc_root;
-extern int proc_parse_options(char *options, struct pid_namespace *pid);
extern void proc_self_init(void);
-extern int proc_remount(struct super_block *, int *, char *);
/*
* task_[no]mmu.c
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index bbcc185062bb..f5834488b67d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -54,6 +54,28 @@ static LIST_HEAD(kclist_head);
static DECLARE_RWSEM(kclist_lock);
static int kcore_need_update = 1;
+/*
+ * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
+ * Same as oldmem_pfn_is_ram in vmcore
+ */
+static int (*mem_pfn_is_ram)(unsigned long pfn);
+
+int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn))
+{
+ if (mem_pfn_is_ram)
+ return -EBUSY;
+ mem_pfn_is_ram = fn;
+ return 0;
+}
+
+static int pfn_is_ram(unsigned long pfn)
+{
+ if (mem_pfn_is_ram)
+ return mem_pfn_is_ram(pfn);
+ else
+ return 1;
+}
+
/* This doesn't grab kclist_lock, so it should only be used at init time. */
void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
int type)
@@ -465,6 +487,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
goto out;
}
m = NULL; /* skip the list anchor */
+ } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) {
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
} else if (m->type == KCORE_VMALLOC) {
vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
@@ -588,7 +615,7 @@ static void __init proc_kcore_text_init(void)
/*
* MODULES_VADDR has no intersection with VMALLOC_ADDR.
*/
-struct kcore_list kcore_modules;
+static struct kcore_list kcore_modules;
static void __init add_modules_range(void)
{
if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 4d598a399bbf..7325baa8f9d4 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1626,8 +1626,11 @@ static void drop_sysctl_table(struct ctl_table_header *header)
if (--header->nreg)
return;
- put_links(header);
- start_unregistering(header);
+ if (parent) {
+ put_links(header);
+ start_unregistering(header);
+ }
+
if (!--header->count)
kfree_rcu(header, rcu);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 621e6ec322ca..8b145e7b9661 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -19,86 +19,178 @@
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/user_namespace.h>
+#include <linux/fs_context.h>
#include <linux/mount.h>
#include <linux/pid_namespace.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
#include <linux/cred.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
#include "internal.h"
-enum {
- Opt_gid, Opt_hidepid, Opt_err,
+struct proc_fs_context {
+ struct pid_namespace *pid_ns;
+ unsigned int mask;
+ int hidepid;
+ int gid;
};
-static const match_table_t tokens = {
- {Opt_hidepid, "hidepid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_err, NULL},
+enum proc_param {
+ Opt_gid,
+ Opt_hidepid,
};
-int proc_parse_options(char *options, struct pid_namespace *pid)
+static const struct fs_parameter_spec proc_param_specs[] = {
+ fsparam_u32("gid", Opt_gid),
+ fsparam_u32("hidepid", Opt_hidepid),
+ {}
+};
+
+static const struct fs_parameter_description proc_fs_parameters = {
+ .name = "proc",
+ .specs = proc_param_specs,
+};
+
+static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- args[0].to = args[0].from = NULL;
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_gid:
- if (match_int(&args[0], &option))
- return 0;
- pid->pid_gid = make_kgid(current_user_ns(), option);
- break;
- case Opt_hidepid:
- if (match_int(&args[0], &option))
- return 0;
- if (option < HIDEPID_OFF ||
- option > HIDEPID_INVISIBLE) {
- pr_err("proc: hidepid value must be between 0 and 2.\n");
- return 0;
- }
- pid->hide_pid = option;
- break;
- default:
- pr_err("proc: unrecognized mount option \"%s\" "
- "or missing value\n", p);
- return 0;
- }
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, &proc_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_gid:
+ ctx->gid = result.uint_32;
+ break;
+
+ case Opt_hidepid:
+ ctx->hidepid = result.uint_32;
+ if (ctx->hidepid < HIDEPID_OFF ||
+ ctx->hidepid > HIDEPID_INVISIBLE)
+ return invalf(fc, "proc: hidepid value must be between 0 and 2.\n");
+ break;
+
+ default:
+ return -EINVAL;
}
- return 1;
+ ctx->mask |= 1 << opt;
+ return 0;
}
-int proc_remount(struct super_block *sb, int *flags, char *data)
+static void proc_apply_options(struct super_block *s,
+ struct fs_context *fc,
+ struct pid_namespace *pid_ns,
+ struct user_namespace *user_ns)
{
+ struct proc_fs_context *ctx = fc->fs_private;
+
+ if (ctx->mask & (1 << Opt_gid))
+ pid_ns->pid_gid = make_kgid(user_ns, ctx->gid);
+ if (ctx->mask & (1 << Opt_hidepid))
+ pid_ns->hide_pid = ctx->hidepid;
+}
+
+static int proc_fill_super(struct super_block *s, struct fs_context *fc)
+{
+ struct pid_namespace *pid_ns = get_pid_ns(s->s_fs_info);
+ struct inode *root_inode;
+ int ret;
+
+ proc_apply_options(s, fc, pid_ns, current_user_ns());
+
+ /* User space would break if executables or devices appear on proc */
+ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
+ s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
+ s->s_blocksize = 1024;
+ s->s_blocksize_bits = 10;
+ s->s_magic = PROC_SUPER_MAGIC;
+ s->s_op = &proc_sops;
+ s->s_time_gran = 1;
+
+ /*
+ * procfs isn't actually a stacking filesystem; however, there is
+ * too much magic going on inside it to permit stacking things on
+ * top of it
+ */
+ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
+
+ /* procfs dentries and inodes don't require IO to create */
+ s->s_shrink.seeks = 0;
+
+ pde_get(&proc_root);
+ root_inode = proc_get_inode(s, &proc_root);
+ if (!root_inode) {
+ pr_err("proc_fill_super: get root inode failed\n");
+ return -ENOMEM;
+ }
+
+ s->s_root = d_make_root(root_inode);
+ if (!s->s_root) {
+ pr_err("proc_fill_super: allocate dentry failed\n");
+ return -ENOMEM;
+ }
+
+ ret = proc_setup_self(s);
+ if (ret) {
+ return ret;
+ }
+ return proc_setup_thread_self(s);
+}
+
+static int proc_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
struct pid_namespace *pid = sb->s_fs_info;
sync_filesystem(sb);
- return !proc_parse_options(data, pid);
+
+ proc_apply_options(sb, fc, pid, current_user_ns());
+ return 0;
}
-static struct dentry *proc_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int proc_get_tree(struct fs_context *fc)
{
- struct pid_namespace *ns;
+ struct proc_fs_context *ctx = fc->fs_private;
- if (flags & SB_KERNMOUNT) {
- ns = data;
- data = NULL;
- } else {
- ns = task_active_pid_ns(current);
- }
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+ fc->s_fs_info = ctx->pid_ns;
+ return vfs_get_super(fc, vfs_get_keyed_super, proc_fill_super);
+}
- return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super);
+static void proc_fs_context_free(struct fs_context *fc)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+
+ if (ctx->pid_ns)
+ put_pid_ns(ctx->pid_ns);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations proc_fs_context_ops = {
+ .free = proc_fs_context_free,
+ .parse_param = proc_parse_param,
+ .get_tree = proc_get_tree,
+ .reconfigure = proc_reconfigure,
+};
+
+static int proc_init_fs_context(struct fs_context *fc)
+{
+ struct proc_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->pid_ns = get_pid_ns(task_active_pid_ns(current));
+ fc->fs_private = ctx;
+ fc->ops = &proc_fs_context_ops;
+ return 0;
}
static void proc_kill_sb(struct super_block *sb)
@@ -115,10 +207,11 @@ static void proc_kill_sb(struct super_block *sb)
}
static struct file_system_type proc_fs_type = {
- .name = "proc",
- .mount = proc_mount,
- .kill_sb = proc_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "proc",
+ .init_fs_context = proc_init_fs_context,
+ .parameters = &proc_fs_parameters,
+ .kill_sb = proc_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
void __init proc_root_init(void)
@@ -156,7 +249,7 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr
{
if (!proc_pid_lookup(dentry, flags))
return NULL;
-
+
return proc_lookup(dir, dentry, flags);
}
@@ -209,9 +302,28 @@ struct proc_dir_entry proc_root = {
int pid_ns_prepare_proc(struct pid_namespace *ns)
{
+ struct proc_fs_context *ctx;
+ struct fs_context *fc;
struct vfsmount *mnt;
- mnt = kern_mount_data(&proc_fs_type, ns);
+ fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT);
+ if (IS_ERR(fc))
+ return PTR_ERR(fc);
+
+ if (fc->user_ns != ns->user_ns) {
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ns->user_ns);
+ }
+
+ ctx = fc->fs_private;
+ if (ctx->pid_ns != ns) {
+ put_pid_ns(ctx->pid_ns);
+ get_pid_ns(ns);
+ ctx->pid_ns = ns;
+ }
+
+ mnt = fc_mount(fc);
+ put_fs_context(fc);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index beccb0b1d57c..95ca1fe7283c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -59,7 +59,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
- SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm);
+ SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
@@ -1143,6 +1143,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
count = -EINTR;
goto out_mm;
}
+ /*
+ * Avoid to modify vma->vm_flags
+ * without locked ops while the
+ * coredump reads the vm_flags.
+ */
+ if (!mmget_still_valid(mm)) {
+ /*
+ * Silently return "count"
+ * like if get_task_mm()
+ * failed. FIXME: should this
+ * function have returned
+ * -ESRCH if get_task_mm()
+ * failed like if
+ * get_proc_task() fails?
+ */
+ up_write(&mm->mmap_sem);
+ goto out_mm;
+ }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
vma->vm_flags &= ~VM_SOFTDIRTY;
vma_set_page_prot(vma);
diff --git a/fs/read_write.c b/fs/read_write.c
index 30df848b7451..61b43ad7608e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -478,8 +478,8 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
return ret;
}
-ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
- loff_t *pos)
+static ssize_t __vfs_write(struct file *file, const char __user *p,
+ size_t count, loff_t *pos)
{
if (file->f_op->write)
return file->f_op->write(file, p, count, pos);
@@ -560,12 +560,13 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
static inline loff_t file_pos_read(struct file *file)
{
- return file->f_pos;
+ return file->f_mode & FMODE_STREAM ? 0 : file->f_pos;
}
static inline void file_pos_write(struct file *file, loff_t pos)
{
- file->f_pos = pos;
+ if ((file->f_mode & FMODE_STREAM) == 0)
+ file->f_pos = pos;
}
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
@@ -1238,6 +1239,9 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
const struct compat_iovec __user *,vec,
unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
+ if (pos == -1)
+ return do_compat_readv(fd, vec, vlen, flags);
+
return do_compat_preadv64(fd, vec, vlen, pos, flags);
}
#endif
@@ -1344,6 +1348,9 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
const struct compat_iovec __user *,vec,
unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
+ if (pos == -1)
+ return do_compat_writev(fd, vec, vlen, flags);
+
return do_compat_pwritev64(fd, vec, vlen, pos, flags);
}
#endif
diff --git a/fs/splice.c b/fs/splice.c
index 6489fb9436e4..25212dcca2df 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -138,7 +138,6 @@ error:
}
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
- .can_merge = 0,
.confirm = page_cache_pipe_buf_confirm,
.release = page_cache_pipe_buf_release,
.steal = page_cache_pipe_buf_steal,
@@ -156,7 +155,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
}
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = page_cache_pipe_buf_release,
.steal = user_page_pipe_buf_steal,
@@ -326,22 +324,20 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
EXPORT_SYMBOL(generic_file_splice_read);
const struct pipe_buf_operations default_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
.steal = generic_pipe_buf_steal,
.get = generic_pipe_buf_get,
};
-static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
+int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
{
return 1;
}
/* Pipe buffer operations for a socket and similar. */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
.steal = generic_pipe_buf_nosteal,
@@ -1597,7 +1593,11 @@ retry:
* Get a reference to this pipe buffer,
* so we can copy the contents over.
*/
- pipe_buf_get(ipipe, ibuf);
+ if (!pipe_buf_get(ipipe, ibuf)) {
+ if (ret == 0)
+ ret = -EFAULT;
+ break;
+ }
*obuf = *ibuf;
/*
@@ -1606,6 +1606,8 @@ retry:
*/
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+ pipe_buf_mark_unmergeable(obuf);
+
obuf->len = len;
opipe->nrbufs++;
ibuf->offset += obuf->len;
@@ -1669,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
* Get a reference to this pipe buffer,
* so we can copy the contents over.
*/
- pipe_buf_get(ipipe, ibuf);
+ if (!pipe_buf_get(ipipe, ibuf)) {
+ if (ret == 0)
+ ret = -EFAULT;
+ break;
+ }
obuf = opipe->bufs + nbuf;
*obuf = *ibuf;
@@ -1680,6 +1686,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
*/
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+ pipe_buf_mark_unmergeable(obuf);
+
if (obuf->len > len)
obuf->len = len;
diff --git a/fs/stat.c b/fs/stat.c
index adbfcd86c81b..c38e4c2e1221 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -45,11 +45,6 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
stat->ctime = inode->i_ctime;
stat->blksize = i_blocksize(inode);
stat->blocks = inode->i_blocks;
-
- if (IS_NOATIME(inode))
- stat->result_mask &= ~STATX_ATIME;
- if (IS_AUTOMOUNT(inode))
- stat->attributes |= STATX_ATTR_AUTOMOUNT;
}
EXPORT_SYMBOL(generic_fillattr);
@@ -75,6 +70,13 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
stat->result_mask |= STATX_BASIC_STATS;
request_mask &= STATX_ALL;
query_flags &= KSTAT_QUERY_FLAGS;
+
+ /* allow the fs to override these if it really wants to */
+ if (IS_NOATIME(inode))
+ stat->result_mask &= ~STATX_ATIME;
+ if (IS_AUTOMOUNT(inode))
+ stat->attributes |= STATX_ATTR_AUTOMOUNT;
+
if (inode->i_op->getattr)
return inode->i_op->getattr(path, stat, request_mask,
query_flags);
diff --git a/fs/super.c b/fs/super.c
index 48e25eba8465..2739f57515f8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -35,6 +35,7 @@
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
#include <linux/user_namespace.h>
+#include <linux/fs_context.h>
#include <uapi/linux/mount.h>
#include "internal.h"
@@ -476,6 +477,94 @@ void generic_shutdown_super(struct super_block *sb)
EXPORT_SYMBOL(generic_shutdown_super);
/**
+ * sget_fc - Find or create a superblock
+ * @fc: Filesystem context.
+ * @test: Comparison callback
+ * @set: Setup callback
+ *
+ * Find or create a superblock using the parameters stored in the filesystem
+ * context and the two callback functions.
+ *
+ * If an extant superblock is matched, then that will be returned with an
+ * elevated reference count that the caller must transfer or discard.
+ *
+ * If no match is made, a new superblock will be allocated and basic
+ * initialisation will be performed (s_type, s_fs_info and s_id will be set and
+ * the set() callback will be invoked), the superblock will be published and it
+ * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE
+ * as yet unset.
+ */
+struct super_block *sget_fc(struct fs_context *fc,
+ int (*test)(struct super_block *, struct fs_context *),
+ int (*set)(struct super_block *, struct fs_context *))
+{
+ struct super_block *s = NULL;
+ struct super_block *old;
+ struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns;
+ int err;
+
+ if (!(fc->sb_flags & SB_KERNMOUNT) &&
+ fc->purpose != FS_CONTEXT_FOR_SUBMOUNT) {
+ /* Don't allow mounting unless the caller has CAP_SYS_ADMIN
+ * over the namespace.
+ */
+ if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) {
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ } else {
+ if (!ns_capable(fc->user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ }
+ }
+
+retry:
+ spin_lock(&sb_lock);
+ if (test) {
+ hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) {
+ if (test(old, fc))
+ goto share_extant_sb;
+ }
+ }
+ if (!s) {
+ spin_unlock(&sb_lock);
+ s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
+ if (!s)
+ return ERR_PTR(-ENOMEM);
+ goto retry;
+ }
+
+ s->s_fs_info = fc->s_fs_info;
+ err = set(s, fc);
+ if (err) {
+ s->s_fs_info = NULL;
+ spin_unlock(&sb_lock);
+ destroy_unused_super(s);
+ return ERR_PTR(err);
+ }
+ fc->s_fs_info = NULL;
+ s->s_type = fc->fs_type;
+ strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id));
+ list_add_tail(&s->s_list, &super_blocks);
+ hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
+ spin_unlock(&sb_lock);
+ get_filesystem(s->s_type);
+ register_shrinker_prepared(&s->s_shrink);
+ return s;
+
+share_extant_sb:
+ if (user_ns != old->s_user_ns) {
+ spin_unlock(&sb_lock);
+ destroy_unused_super(s);
+ return ERR_PTR(-EBUSY);
+ }
+ if (!grab_super(old))
+ goto retry;
+ destroy_unused_super(s);
+ return old;
+}
+EXPORT_SYMBOL(sget_fc);
+
+/**
* sget_userns - find or create a superblock
* @type: filesystem type superblock should belong to
* @test: comparison callback
@@ -835,28 +924,35 @@ rescan:
}
/**
- * do_remount_sb - asks filesystem to change mount options.
- * @sb: superblock in question
- * @sb_flags: revised superblock flags
- * @data: the rest of options
- * @force: whether or not to force the change
+ * reconfigure_super - asks filesystem to change superblock parameters
+ * @fc: The superblock and configuration
*
- * Alters the mount options of a mounted file system.
+ * Alters the configuration parameters of a live superblock.
*/
-int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
+int reconfigure_super(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
int retval;
- int remount_ro;
+ bool remount_ro = false;
+ bool force = fc->sb_flags & SB_FORCE;
+ if (fc->sb_flags_mask & ~MS_RMT_MASK)
+ return -EINVAL;
if (sb->s_writers.frozen != SB_UNFROZEN)
return -EBUSY;
+ retval = security_sb_remount(sb, fc->security);
+ if (retval)
+ return retval;
+
+ if (fc->sb_flags_mask & SB_RDONLY) {
#ifdef CONFIG_BLOCK
- if (!(sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev))
- return -EACCES;
+ if (!(fc->sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev))
+ return -EACCES;
#endif
- remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+ remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+ }
if (remount_ro) {
if (!hlist_empty(&sb->s_pins)) {
@@ -867,13 +963,14 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
return 0;
if (sb->s_writers.frozen != SB_UNFROZEN)
return -EBUSY;
- remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+ remount_ro = !sb_rdonly(sb);
}
}
shrink_dcache_sb(sb);
- /* If we are remounting RDONLY and current sb is read/write,
- make sure there are no rw files opened */
+ /* If we are reconfiguring to RDONLY and current sb is read/write,
+ * make sure there are no files open for writing.
+ */
if (remount_ro) {
if (force) {
sb->s_readonly_remount = 1;
@@ -885,8 +982,8 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
}
}
- if (sb->s_op->remount_fs) {
- retval = sb->s_op->remount_fs(sb, &sb_flags, data);
+ if (fc->ops->reconfigure) {
+ retval = fc->ops->reconfigure(fc);
if (retval) {
if (!force)
goto cancel_readonly;
@@ -895,7 +992,9 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
sb->s_type->name, retval);
}
}
- sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (sb_flags & MS_RMT_MASK);
+
+ WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) |
+ (fc->sb_flags & fc->sb_flags_mask)));
/* Needs to be ordered wrt mnt_is_readonly() */
smp_wmb();
sb->s_readonly_remount = 0;
@@ -922,10 +1021,15 @@ static void do_emergency_remount_callback(struct super_block *sb)
down_write(&sb->s_umount);
if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) &&
!sb_rdonly(sb)) {
- /*
- * What lock protects sb->s_flags??
- */
- do_remount_sb(sb, SB_RDONLY, NULL, 1);
+ struct fs_context *fc;
+
+ fc = fs_context_for_reconfigure(sb->s_root,
+ SB_RDONLY | SB_FORCE, SB_RDONLY);
+ if (!IS_ERR(fc)) {
+ if (parse_monolithic_mount_data(fc, NULL) == 0)
+ (void)reconfigure_super(fc);
+ put_fs_context(fc);
+ }
}
up_write(&sb->s_umount);
}
@@ -1087,6 +1191,89 @@ struct dentry *mount_ns(struct file_system_type *fs_type,
EXPORT_SYMBOL(mount_ns);
+int set_anon_super_fc(struct super_block *sb, struct fs_context *fc)
+{
+ return set_anon_super(sb, NULL);
+}
+EXPORT_SYMBOL(set_anon_super_fc);
+
+static int test_keyed_super(struct super_block *sb, struct fs_context *fc)
+{
+ return sb->s_fs_info == fc->s_fs_info;
+}
+
+static int test_single_super(struct super_block *s, struct fs_context *fc)
+{
+ return 1;
+}
+
+/**
+ * vfs_get_super - Get a superblock with a search key set in s_fs_info.
+ * @fc: The filesystem context holding the parameters
+ * @keying: How to distinguish superblocks
+ * @fill_super: Helper to initialise a new superblock
+ *
+ * Search for a superblock and create a new one if not found. The search
+ * criterion is controlled by @keying. If the search fails, a new superblock
+ * is created and @fill_super() is called to initialise it.
+ *
+ * @keying can take one of a number of values:
+ *
+ * (1) vfs_get_single_super - Only one superblock of this type may exist on the
+ * system. This is typically used for special system filesystems.
+ *
+ * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have
+ * distinct keys (where the key is in s_fs_info). Searching for the same
+ * key again will turn up the superblock for that key.
+ *
+ * (3) vfs_get_independent_super - Multiple superblocks may exist and are
+ * unkeyed. Each call will get a new superblock.
+ *
+ * A permissions check is made by sget_fc() unless we're getting a superblock
+ * for a kernel-internal mount or a submount.
+ */
+int vfs_get_super(struct fs_context *fc,
+ enum vfs_get_super_keying keying,
+ int (*fill_super)(struct super_block *sb,
+ struct fs_context *fc))
+{
+ int (*test)(struct super_block *, struct fs_context *);
+ struct super_block *sb;
+
+ switch (keying) {
+ case vfs_get_single_super:
+ test = test_single_super;
+ break;
+ case vfs_get_keyed_super:
+ test = test_keyed_super;
+ break;
+ case vfs_get_independent_super:
+ test = NULL;
+ break;
+ default:
+ BUG();
+ }
+
+ sb = sget_fc(fc, test, set_anon_super_fc);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ if (!sb->s_root) {
+ int err = fill_super(sb, fc);
+ if (err) {
+ deactivate_locked_super(sb);
+ return err;
+ }
+
+ sb->s_flags |= SB_ACTIVE;
+ }
+
+ BUG_ON(fc->root);
+ fc->root = dget(sb->s_root);
+ return 0;
+}
+EXPORT_SYMBOL(vfs_get_super);
+
#ifdef CONFIG_BLOCK
static int set_bdev_super(struct super_block *s, void *data)
{
@@ -1212,6 +1399,31 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
}
EXPORT_SYMBOL(mount_nodev);
+static int reconfigure_single(struct super_block *s,
+ int flags, void *data)
+{
+ struct fs_context *fc;
+ int ret;
+
+ /* The caller really need to be passing fc down into mount_single(),
+ * then a chunk of this can be removed. [Bollocks -- AV]
+ * Better yet, reconfiguration shouldn't happen, but rather the second
+ * mount should be rejected if the parameters are not compatible.
+ */
+ fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK);
+ if (IS_ERR(fc))
+ return PTR_ERR(fc);
+
+ ret = parse_monolithic_mount_data(fc, data);
+ if (ret < 0)
+ goto out;
+
+ ret = reconfigure_super(fc);
+out:
+ put_fs_context(fc);
+ return ret;
+}
+
static int compare_single(struct super_block *s, void *p)
{
return 1;
@@ -1229,41 +1441,59 @@ struct dentry *mount_single(struct file_system_type *fs_type,
return ERR_CAST(s);
if (!s->s_root) {
error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
- s->s_flags |= SB_ACTIVE;
+ if (!error)
+ s->s_flags |= SB_ACTIVE;
} else {
- do_remount_sb(s, flags, data, 0);
+ error = reconfigure_single(s, flags, data);
+ }
+ if (unlikely(error)) {
+ deactivate_locked_super(s);
+ return ERR_PTR(error);
}
return dget(s->s_root);
}
EXPORT_SYMBOL(mount_single);
-struct dentry *
-mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
+/**
+ * vfs_get_tree - Get the mountable root
+ * @fc: The superblock configuration context.
+ *
+ * The filesystem is invoked to get or create a superblock which can then later
+ * be used for mounting. The filesystem places a pointer to the root to be
+ * used for mounting in @fc->root.
+ */
+int vfs_get_tree(struct fs_context *fc)
{
- struct dentry *root;
struct super_block *sb;
- int error = -ENOMEM;
- void *sec_opts = NULL;
+ int error;
- if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
- error = security_sb_eat_lsm_opts(data, &sec_opts);
- if (error)
- return ERR_PTR(error);
- }
+ if (fc->root)
+ return -EBUSY;
- root = type->mount(type, flags, name, data);
- if (IS_ERR(root)) {
- error = PTR_ERR(root);
- goto out_free_secdata;
+ /* Get the mountable root in fc->root, with a ref on the root and a ref
+ * on the superblock.
+ */
+ error = fc->ops->get_tree(fc);
+ if (error < 0)
+ return error;
+
+ if (!fc->root) {
+ pr_err("Filesystem %s get_tree() didn't set fc->root\n",
+ fc->fs_type->name);
+ /* We don't know what the locking state of the superblock is -
+ * if there is a superblock.
+ */
+ BUG();
}
- sb = root->d_sb;
- BUG_ON(!sb);
+
+ sb = fc->root->d_sb;
WARN_ON(!sb->s_bdi);
+ if (fc->subtype && !sb->s_subtype) {
+ sb->s_subtype = fc->subtype;
+ fc->subtype = NULL;
+ }
+
/*
* Write barrier is for super_cache_count(). We place it before setting
* SB_BORN as the data dependency between the two functions is the
@@ -1273,14 +1503,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
smp_wmb();
sb->s_flags |= SB_BORN;
- error = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
- if (error)
- goto out_sb;
-
- if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT))) {
- error = security_sb_kern_mount(sb);
- if (error)
- goto out_sb;
+ error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL);
+ if (unlikely(error)) {
+ fc_drop_locked(fc);
+ return error;
}
/*
@@ -1290,18 +1516,11 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
* violate this rule.
*/
WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
- "negative value (%lld)\n", type->name, sb->s_maxbytes);
+ "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes);
- up_write(&sb->s_umount);
- security_free_mnt_opts(&sec_opts);
- return root;
-out_sb:
- dput(root);
- deactivate_locked_super(sb);
-out_free_secdata:
- security_free_mnt_opts(&sec_opts);
- return ERR_PTR(error);
+ return 0;
}
+EXPORT_SYMBOL(vfs_get_tree);
/*
* Setup private BDI for given superblock. It gets automatically cleaned up
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 92682fcc41f6..1b56686ab178 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -13,34 +13,71 @@
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/fs_context.h>
+#include <net/net_namespace.h>
#include "sysfs.h"
static struct kernfs_root *sysfs_root;
struct kernfs_node *sysfs_root_kn;
-static struct dentry *sysfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int sysfs_get_tree(struct fs_context *fc)
{
- struct dentry *root;
- void *ns;
- bool new_sb = false;
+ struct kernfs_fs_context *kfc = fc->fs_private;
+ int ret;
- if (!(flags & SB_KERNMOUNT)) {
+ ret = kernfs_get_tree(fc);
+ if (ret)
+ return ret;
+
+ if (kfc->new_sb_created)
+ fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
+ return 0;
+}
+
+static void sysfs_fs_context_free(struct fs_context *fc)
+{
+ struct kernfs_fs_context *kfc = fc->fs_private;
+
+ if (kfc->ns_tag)
+ kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag);
+ kernfs_free_fs_context(fc);
+ kfree(kfc);
+}
+
+static const struct fs_context_operations sysfs_fs_context_ops = {
+ .free = sysfs_fs_context_free,
+ .get_tree = sysfs_get_tree,
+};
+
+static int sysfs_init_fs_context(struct fs_context *fc)
+{
+ struct kernfs_fs_context *kfc;
+ struct net *netns;
+
+ if (!(fc->sb_flags & SB_KERNMOUNT)) {
if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
- return ERR_PTR(-EPERM);
+ return -EPERM;
}
- ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
- root = kernfs_mount_ns(fs_type, flags, sysfs_root,
- SYSFS_MAGIC, &new_sb, ns);
- if (!new_sb)
- kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
- else if (!IS_ERR(root))
- root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
+ kfc = kzalloc(sizeof(struct kernfs_fs_context), GFP_KERNEL);
+ if (!kfc)
+ return -ENOMEM;
- return root;
+ kfc->ns_tag = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
+ kfc->root = sysfs_root;
+ kfc->magic = SYSFS_MAGIC;
+ fc->fs_private = kfc;
+ fc->ops = &sysfs_fs_context_ops;
+ if (netns) {
+ if (fc->user_ns)
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(netns->user_ns);
+ }
+ fc->global = true;
+ return 0;
}
static void sysfs_kill_sb(struct super_block *sb)
@@ -52,10 +89,10 @@ static void sysfs_kill_sb(struct super_block *sb)
}
static struct file_system_type sysfs_fs_type = {
- .name = "sysfs",
- .mount = sysfs_mount,
- .kill_sb = sysfs_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "sysfs",
+ .init_fs_context = sysfs_init_fs_context,
+ .kill_sb = sysfs_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
int __init sysfs_init(void)
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index bc1e082d921d..9da2f135121b 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -8,6 +8,7 @@ config UBIFS_FS
select CRYPTO_LZO if UBIFS_FS_LZO
select CRYPTO_DEFLATE if UBIFS_FS_ZLIB
select CRYPTO_HASH_INFO
+ select UBIFS_FS_XATTR if FS_ENCRYPTION
depends on MTD_UBI
help
UBIFS is a file system for flash devices which works on top of UBI.
@@ -60,17 +61,6 @@ config UBIFS_FS_XATTR
If unsure, say Y.
-config UBIFS_FS_ENCRYPTION
- bool "UBIFS Encryption"
- depends on UBIFS_FS_XATTR && BLOCK
- select FS_ENCRYPTION
- default n
- help
- Enable encryption of UBIFS files and directories. This
- feature is similar to ecryptfs, but it is more memory
- efficient since it avoids caching the encrypted and
- decrypted pages in the page cache.
-
config UBIFS_FS_SECURITY
bool "UBIFS Security Labels"
depends on UBIFS_FS_XATTR
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index 5f838319c8d5..5c4b845754a7 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -6,6 +6,6 @@ ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o debug.o
ubifs-y += misc.o
-ubifs-$(CONFIG_UBIFS_FS_ENCRYPTION) += crypto.o
+ubifs-$(CONFIG_FS_ENCRYPTION) += crypto.o
ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o
ubifs-$(CONFIG_UBIFS_FS_AUTHENTICATION) += auth.o
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index 5bf5fd08879e..b758004085c4 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -33,7 +33,6 @@ int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node,
int err;
shash->tfm = c->hash_tfm;
- shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
err = crypto_shash_digest(shash, node, le32_to_cpu(ch->len), hash);
if (err < 0)
@@ -56,7 +55,6 @@ static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash,
int err;
shash->tfm = c->hmac_tfm;
- shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
err = crypto_shash_digest(shash, hash, c->hash_len, hmac);
if (err < 0)
@@ -88,7 +86,6 @@ int ubifs_prepare_auth_node(struct ubifs_info *c, void *node,
return -ENOMEM;
hash_desc->tfm = c->hash_tfm;
- hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
ubifs_shash_copy_state(c, inhash, hash_desc);
err = crypto_shash_final(hash_desc, hash);
@@ -123,7 +120,6 @@ static struct shash_desc *ubifs_get_desc(const struct ubifs_info *c,
return ERR_PTR(-ENOMEM);
desc->tfm = tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
err = crypto_shash_init(desc);
if (err) {
@@ -364,7 +360,6 @@ static int ubifs_node_calc_hmac(const struct ubifs_info *c, const void *node,
ubifs_assert(c, ofs_hmac + hmac_len < len);
shash->tfm = c->hmac_tfm;
- shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
err = crypto_shash_init(shash);
if (err)
@@ -483,7 +478,6 @@ int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac)
return 0;
shash->tfm = c->hmac_tfm;
- shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
err = crypto_shash_init(shash);
if (err)
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 0164bcc827f8..82e4e6a30b04 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -28,6 +28,11 @@
#include <linux/mount.h>
#include "ubifs.h"
+/* Need to be kept consistent with checked flags in ioctl2ubifs() */
+#define UBIFS_SUPPORTED_IOCTL_FLAGS \
+ (FS_COMPR_FL | FS_SYNC_FL | FS_APPEND_FL | \
+ FS_IMMUTABLE_FL | FS_DIRSYNC_FL)
+
/**
* ubifs_set_inode_flags - set VFS inode flags.
* @inode: VFS inode to set flags for
@@ -169,6 +174,9 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
if (get_user(flags, (int __user *) arg))
return -EFAULT;
+ if (flags & ~UBIFS_SUPPORTED_IOCTL_FLAGS)
+ return -EOPNOTSUPP;
+
if (!S_ISDIR(inode->i_mode))
flags &= ~FS_DIRSYNC_FL;
@@ -185,7 +193,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return err;
}
case FS_IOC_SET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
struct ubifs_info *c = inode->i_sb->s_fs_info;
err = ubifs_enable_encryption(c);
@@ -198,7 +206,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
#endif
}
case FS_IOC_GET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
return fscrypt_ioctl_get_policy(file, (void __user *)arg);
#else
return -EOPNOTSUPP;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 0a0e65c07c6d..5c8a81a019a4 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -576,7 +576,6 @@ static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_h
SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm);
hash_desc->tfm = c->hash_tfm;
- hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
ubifs_shash_copy_state(c, log_hash, hash_desc);
return crypto_shash_final(hash_desc, hash);
@@ -587,7 +586,6 @@ static int authenticate_sleb_hmac(struct ubifs_info *c, u8 *hash, u8 *hmac)
SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm);
hmac_desc->tfm = c->hmac_tfm;
- hmac_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
return crypto_shash_digest(hmac_desc, hash, c->hash_len, hmac);
}
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 3da90c951c23..67fac1e8adfb 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -748,7 +748,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
goto out;
}
-#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+#ifndef CONFIG_FS_ENCRYPTION
if (c->encrypted) {
ubifs_err(c, "file system contains encrypted files but UBIFS"
" was built without crypto support.");
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1fac1133dadd..12628184772c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -276,14 +276,12 @@ static void ubifs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
struct ubifs_inode *ui = ubifs_inode(inode);
+ kfree(ui->data);
kmem_cache_free(ubifs_inode_slab, ui);
}
static void ubifs_destroy_inode(struct inode *inode)
{
- struct ubifs_inode *ui = ubifs_inode(inode);
-
- kfree(ui->data);
call_rcu(&inode->i_rcu, ubifs_i_callback);
}
@@ -2146,7 +2144,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_UBIFS_FS_XATTR
sb->s_xattr = ubifs_xattr_handlers;
#endif
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
sb->s_cop = &ubifs_crypt_operations;
#endif
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 38401adaa00d..1ae12900e01d 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -43,7 +43,6 @@
#include <crypto/hash.h>
#include <crypto/algapi.h>
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_UBIFS_FS_ENCRYPTION)
#include <linux/fscrypt.h>
#include "ubifs-media.h"
@@ -142,7 +141,7 @@
*/
#define WORST_COMPR_FACTOR 2
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
#define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE
#else
#define UBIFS_CIPHER_BLOCK_SIZE 0
@@ -2072,7 +2071,7 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
#include "misc.h"
#include "key.h"
-#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+#ifndef CONFIG_FS_ENCRYPTION
static inline int ubifs_encrypt(const struct inode *inode,
struct ubifs_data_node *dn,
unsigned int in_len, unsigned int *out_len,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index ae796e10f68b..e7276932e433 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1242,8 +1242,10 @@ set_size:
truncate_setsize(inode, newsize);
down_write(&iinfo->i_data_sem);
udf_clear_extent_cache(inode);
- udf_truncate_extents(inode);
+ err = udf_truncate_extents(inode);
up_write(&iinfo->i_data_sem);
+ if (err)
+ return err;
}
update_time:
inode->i_mtime = inode->i_ctime = current_time(inode);
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index b647f0bd150c..63a47f1e1d52 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -199,7 +199,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
* for making file shorter. For making file longer, udf_extend_file() has to
* be used.
*/
-void udf_truncate_extents(struct inode *inode)
+int udf_truncate_extents(struct inode *inode)
{
struct extent_position epos;
struct kernel_lb_addr eloc, neloc = {};
@@ -224,7 +224,7 @@ void udf_truncate_extents(struct inode *inode)
if (etype == -1) {
/* We should extend the file? */
WARN_ON(byte_offset);
- return;
+ return 0;
}
epos.offset -= adsize;
extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
@@ -260,6 +260,9 @@ void udf_truncate_extents(struct inode *inode)
epos.block = eloc;
epos.bh = udf_tread(sb,
udf_get_lb_pblock(sb, &eloc, 0));
+ /* Error reading indirect block? */
+ if (!epos.bh)
+ return -EIO;
if (elen)
indirect_ext_len =
(elen + sb->s_blocksize - 1) >>
@@ -283,4 +286,5 @@ void udf_truncate_extents(struct inode *inode)
iinfo->i_lenExtents = inode->i_size;
brelse(epos.bh);
+ return 0;
}
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index ee246769dee4..d89ef71887fc 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -235,7 +235,7 @@ extern struct inode *udf_new_inode(struct inode *, umode_t);
/* truncate.c */
extern void udf_truncate_tail_extent(struct inode *);
extern void udf_discard_prealloc(struct inode *);
-extern void udf_truncate_extents(struct inode *);
+extern int udf_truncate_extents(struct inode *);
/* balloc.c */
extern void udf_free_blocks(struct super_block *, struct inode *,
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 1fd3011ea623..7fd4802222b8 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -229,7 +229,7 @@ ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode)
case UFS_UID_44BSD:
return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_gid);
case UFS_UID_EFT:
- if (inode->ui_u1.oldids.ui_suid == 0xFFFF)
+ if (inode->ui_u1.oldids.ui_sgid == 0xFFFF)
return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid);
/* Fall through */
default:
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 89800fc7dc9d..f5de1e726356 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -629,6 +629,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
/* the various vma->vm_userfaultfd_ctx still points to it */
down_write(&mm->mmap_sem);
+ /* no task can run (and in turn coredump) yet */
+ VM_WARN_ON(!mmget_still_valid(mm));
for (vma = mm->mmap; vma; vma = vma->vm_next)
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -883,6 +885,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
* taking the mmap_sem for writing.
*/
down_write(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto skip_mm;
prev = NULL;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
cond_resched();
@@ -905,6 +909,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
+skip_mm:
up_write(&mm->mmap_sem);
mmput(mm);
wakeup:
@@ -1333,6 +1338,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
goto out;
down_write(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto out_unlock;
vma = find_vma_prev(mm, start, &prev);
if (!vma)
goto out_unlock;
@@ -1520,6 +1527,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
goto out;
down_write(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto out_unlock;
vma = find_vma_prev(mm, start, &prev);
if (!vma)
goto out_unlock;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 48502cb9990f..4637ae1ae91c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1191,7 +1191,10 @@ xfs_iread_extents(
* Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
*/
level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
+ if (unlikely(level == 0)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
bno = be64_to_cpu(*pp);
@@ -4249,9 +4252,13 @@ xfs_bmapi_write(
struct xfs_bmbt_irec *mval, /* output: map values */
int *nmap) /* i/o: mval size/count */
{
+ struct xfs_bmalloca bma = {
+ .tp = tp,
+ .ip = ip,
+ .total = total,
+ };
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
- struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */
xfs_fileoff_t end; /* end of mapped file region */
bool eof = false; /* after the end of extents */
int error; /* error return */
@@ -4319,10 +4326,6 @@ xfs_bmapi_write(
eof = true;
if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
bma.prev.br_startoff = NULLFILEOFF;
- bma.tp = tp;
- bma.ip = ip;
- bma.total = total;
- bma.datatype = 0;
bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
n = 0;
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 9a3767818c50..9c2a0a13ed61 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -563,43 +563,40 @@ xfs_dir3_leaf_find_entry(
*/
int /* error */
xfs_dir2_leaf_addname(
- xfs_da_args_t *args) /* operation arguments */
+ struct xfs_da_args *args) /* operation arguments */
{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_trans *tp = args->trans;
__be16 *bestsp; /* freespace table in leaf */
- int compact; /* need to compact leaves */
- xfs_dir2_data_hdr_t *hdr; /* data block header */
+ __be16 *tagp; /* end of data entry */
struct xfs_buf *dbp; /* data block buffer */
- xfs_dir2_data_entry_t *dep; /* data block entry */
- xfs_inode_t *dp; /* incore directory inode */
- xfs_dir2_data_unused_t *dup; /* data unused entry */
+ struct xfs_buf *lbp; /* leaf's buffer */
+ struct xfs_dir2_leaf *leaf; /* leaf structure */
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
+ struct xfs_dir2_data_hdr *hdr; /* data block header */
+ struct xfs_dir2_data_entry *dep; /* data block entry */
+ struct xfs_dir2_leaf_entry *lep; /* leaf entry table pointer */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir2_data_unused *dup; /* data unused entry */
+ struct xfs_dir2_leaf_tail *ltp; /* leaf tail pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ int compact; /* need to compact leaves */
int error; /* error return value */
int grown; /* allocated new data block */
- int highstale; /* index of next stale leaf */
+ int highstale = 0; /* index of next stale leaf */
int i; /* temporary, index */
int index; /* leaf table position */
- struct xfs_buf *lbp; /* leaf's buffer */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
int length; /* length of new entry */
- xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
int lfloglow; /* low leaf logging index */
int lfloghigh; /* high leaf logging index */
- int lowstale; /* index of prev stale leaf */
- xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
+ int lowstale = 0; /* index of prev stale leaf */
int needbytes; /* leaf block bytes needed */
int needlog; /* need to log data header */
int needscan; /* need to rescan data free */
- __be16 *tagp; /* end of data entry */
- xfs_trans_t *tp; /* transaction pointer */
xfs_dir2_db_t use_block; /* data block number */
- struct xfs_dir2_data_free *bf; /* bestfree table */
- struct xfs_dir2_leaf_entry *ents;
- struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_addname(args);
- dp = args->dp;
- tp = args->trans;
-
error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 3b03703c5c3d..16731d2d684b 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -426,24 +426,22 @@ xfs_dir2_leaf_to_node(
static int /* error */
xfs_dir2_leafn_add(
struct xfs_buf *bp, /* leaf buffer */
- xfs_da_args_t *args, /* operation arguments */
+ struct xfs_da_args *args, /* operation arguments */
int index) /* insertion pt for new entry */
{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir2_leaf_entry *lep;
+ struct xfs_dir2_leaf_entry *ents;
int compact; /* compacting stale leaves */
- xfs_inode_t *dp; /* incore directory inode */
- int highstale; /* next stale entry */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ int highstale = 0; /* next stale entry */
int lfloghigh; /* high leaf entry logging */
int lfloglow; /* low leaf entry logging */
- int lowstale; /* previous stale entry */
- struct xfs_dir3_icleaf_hdr leafhdr;
- struct xfs_dir2_leaf_entry *ents;
+ int lowstale = 0; /* previous stale entry */
trace_xfs_dir2_leafn_add(args, index);
- dp = args->dp;
- leaf = bp->b_addr;
dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
ents = dp->d_ops->leaf_ents_p(leaf);
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 6f94d1f7322d..117910db51b8 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -415,8 +415,17 @@ xchk_btree_check_owner(
struct xfs_btree_cur *cur = bs->cur;
struct check_owner *co;
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && bp == NULL)
+ /*
+ * In theory, xfs_btree_get_block should only give us a null buffer
+ * pointer for the root of a root-in-inode btree type, but we need
+ * to check defensively here in case the cursor state is also screwed
+ * up.
+ */
+ if (bp == NULL) {
+ if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, level);
return 0;
+ }
/*
* We want to cross-reference each btree block with the bnobt
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index f1260b4bfdee..90527b094878 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -574,6 +574,11 @@ xchk_da_btree(
/* Drill another level deeper. */
blkno = be32_to_cpu(key->before);
level++;
+ if (level >= XFS_DA_NODE_MAXDEPTH) {
+ /* Too deep! */
+ xchk_da_set_corrupt(&ds, level - 1);
+ break;
+ }
ds.tree_level--;
error = xchk_da_btree_block(&ds, level, blkno);
if (error)
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 93f07edafd81..9ee2a7d02e70 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -161,6 +161,14 @@ xfs_ioc_trim(
return -EPERM;
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
+
+ /*
+ * We haven't recovered the log, so we cannot use our bnobt-guided
+ * storage zapping commands.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return -EROFS;
+
if (copy_from_user(&range, urange, sizeof(range)))
return -EFAULT;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f2e2845eb76..a7ceae90110e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -529,18 +529,17 @@ xfs_file_dio_aio_write(
count = iov_iter_count(from);
/*
- * If we are doing unaligned IO, wait for all other IO to drain,
- * otherwise demote the lock if we had to take the exclusive lock
- * for other reasons in xfs_file_aio_write_checks.
+ * If we are doing unaligned IO, we can't allow any other overlapping IO
+ * in-flight at the same time or we risk data corruption. Wait for all
+ * other IO to drain before we submit. If the IO is aligned, demote the
+ * iolock if we had to take the exclusive lock in
+ * xfs_file_aio_write_checks() for other reasons.
*/
if (unaligned_io) {
- /* If we are going to wait for other DIO to finish, bail */
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (atomic_read(&inode->i_dio_count))
- return -EAGAIN;
- } else {
- inode_dio_wait(inode);
- }
+ /* unaligned dio always waits, bail */
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_dio_wait(inode);
} else if (iolock == XFS_IOLOCK_EXCL) {
xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
@@ -548,6 +547,14 @@ xfs_file_dio_aio_write(
trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
+
+ /*
+ * If unaligned, this is the only IO in-flight. If it has not yet
+ * completed, wait on it before we release the iolock to prevent
+ * subsequent overlapping IO.
+ */
+ if (ret == -EIOCBQUEUED && unaligned_io)
+ inode_dio_wait(inode);
out:
xfs_iunlock(ip, iolock);